"""Streamlit app that detects the language of an SMS and classifies it as spam or ham."""

import base64
import io
import time

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertForSequenceClassification, BertTokenizerFast

# Page configuration is applied first, before any other Streamlit call in this script.
st.set_page_config(
    page_title="SMS Spam Guard",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Display names for the language codes this app knows how to label.
# Codes missing from this table are shown capitalized (see the lookups below).
LANGUAGE_NAMES = {
    "ar": "Arabic", "bg": "Bulgarian", "de": "German", "el": "Greek",
    "en": "English", "es": "Spanish", "fr": "French", "hi": "Hindi",
    "it": "Italian", "ja": "Japanese", "nl": "Dutch", "pl": "Polish",
    "pt": "Portuguese", "ru": "Russian", "sw": "Swahili", "th": "Thai",
    "tr": "Turkish", "ur": "Urdu", "vi": "Vietnamese", "zh": "Chinese",
}


def create_spam_guard_logo():
    """Draw the "SG" logo and return it as a PNG data URI.

    The logo is a 200x200 transparent RGBA image: a blue trapezoid, a green
    zig-zag accent line, a lighter blue outline and the white text "SG".

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    width, height = 200, 200
    img = Image.new("RGBA", (width, height), (0, 0, 0, 0))  # transparent background
    draw = ImageDraw.Draw(img)

    primary_blue = (20, 120, 220)
    accent_green = (0, 200, 150)
    light_accent_blue = (100, 180, 240)
    white_color = (255, 255, 255)

    def scaled(fx, fy):
        # Convert fractional coordinates into pixel coordinates.
        return (width * fx, height * fy)

    # Main body: a trapezoid that is wider at the top.
    draw.polygon(
        [scaled(0.15, 0.2), scaled(0.85, 0.2), scaled(0.75, 0.8), scaled(0.25, 0.8)],
        fill=primary_blue,
    )

    # Accent element: a thick zig-zag line across the body.
    # The original passed joint="miter"; Pillow documents only "curve" or None
    # for `joint`, so the argument is omitted here.
    draw.line(
        [
            scaled(0.3, 0.35), scaled(0.7, 0.35),
            scaled(0.7, 0.5), scaled(0.3, 0.5),
            scaled(0.3, 0.65), scaled(0.7, 0.65),
        ],
        fill=accent_green,
        width=18,
    )

    # Secondary outline just inside the body for a flat "depth" effect.
    draw.polygon(
        [scaled(0.18, 0.22), scaled(0.82, 0.22), scaled(0.72, 0.78), scaled(0.28, 0.78)],
        outline=light_accent_blue,
        width=4,
    )

    # Arial Bold is used when the font file can be found; otherwise fall back
    # to Pillow's built-in default font.
    try:
        font = ImageFont.truetype("arialbd.ttf", 70)
    except OSError:
        font = ImageFont.load_default()

    text = "SG"
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    text_x = (width - (right - left)) / 2
    # The +5 nudges the text down so it lines up with the green accent.
    text_y = (height - (bottom - top)) / 2 + 5
    draw.text((text_x, text_y), text, font=font, fill=white_color)

    buffered = io.BytesIO()
    img.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"


# Custom CSS hook.
# NOTE(review): the payload is blank in this file, so this call injects nothing.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def load_language_model():
    """Load the language detection tokenizer and model.

    Decorated with ``st.cache_resource`` so the pair is shared across reruns.

    Returns:
        A ``(tokenizer, model)`` tuple for
        ``papluca/xlm-roberta-base-language-detection``.
    """
    model_name = "papluca/xlm-roberta-base-language-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model


@st.cache_resource
def load_spam_model():
    """Load the fine-tuned BERT spam detection tokenizer and model.

    Decorated with ``st.cache_resource`` so the pair is shared across reruns.

    Returns:
        A ``(tokenizer, model)`` tuple loaded from ``chjivan/final``.
    """
    model_path = "chjivan/final"
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    return tokenizer, model


def detect_language(text, tokenizer, model):
    """Detect the language of the input text.

    Args:
        text: The message to analyze.
        tokenizer: Tokenizer matching ``model``.
        model: Sequence-classification model whose ``config.id2label`` maps
            class ids to language codes.

    Returns:
        ``(predicted_language, confidence, top_langs)`` where ``top_langs`` is
        a list of up to three ``(language, probability)`` pairs ordered from
        most to least likely, and the first pair is the prediction itself.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)[0]

    # One topk call serves both the prediction and the ranking; k is clamped
    # so a model with fewer than three labels does not raise.
    top = torch.topk(probabilities, min(3, probabilities.numel()))
    top_langs = [
        (model.config.id2label[idx], prob)
        for idx, prob in zip(top.indices.tolist(), top.values.tolist())
    ]
    predicted_language, confidence = top_langs[0]
    return predicted_language, confidence, top_langs


def classify_spam(text, tokenizer, model):
    """Classify the input text as spam or ham.

    Args:
        text: The message to analyze (truncated to 128 tokens).
        tokenizer: Tokenizer matching ``model``.
        model: Binary sequence-classification model.

    Returns:
        ``(is_spam, confidence)``. ``confidence`` is the probability of the
        *predicted* class, whichever class that is.
    """
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=128
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)[0]
    predicted_class_id = torch.argmax(probabilities).item()
    confidence = probabilities[predicted_class_id].item()
    # NOTE(review): assumes class id 1 is the spam label of the fine-tuned
    # model - confirm against the model's label mapping.
    is_spam = predicted_class_id == 1
    return is_spam, confidence


# NOTE(review): logo_data is not referenced by any markup visible in this file.
logo_data = create_spam_guard_logo()

# Custom CSS animation hook (payload is blank in this file).
st.markdown(""" """, unsafe_allow_html=True)

# Load both models (cached after the first run).
with st.spinner("Loading models... This may take a moment."):
    lang_tokenizer, lang_model = load_language_model()
    spam_tokenizer, spam_model = load_spam_model()

# App header
st.markdown("""

SMS Spam Guard

Intelligent SMS Filtering Assistant by China Mobile Communications Group Co.,Ltd

""", unsafe_allow_html=True)

# Two-column layout: info and sample buttons on the left, analysis on the right.
col1, col2 = st.columns([1, 2])

with col1:
    st.markdown("""

About Us

China Mobile Communications Group Co.,Ltd provides intelligent communication security solutions to protect users from spam and fraudulent messages.

""", unsafe_allow_html=True)

    st.markdown("""

Our Technology

✅ Advanced AI-powered spam detection
🌐 Multi-language support
🔒 Secure and private processing
⚡ Real-time analysis

""", unsafe_allow_html=True)

    st.markdown("""

Sample Messages

""", unsafe_allow_html=True)

    # These buttons run before the text area below is instantiated, so writing
    # to its session-state key here pre-fills the widget in the same run.
    if st.button("Sample Spam (English)", key="spam_btn_en", help="Load a sample English spam message", type="secondary"):
        st.session_state.sms_input = "URGENT: You have won a $1,000 Walmart gift card. Go to http://bit.ly/claim-prize to claim now before it expires!"
    if st.button("Sample Legitimate (English)", key="ham_btn_en", help="Load a sample English legitimate message", type="secondary"):
        st.session_state.sms_input = "Your Amazon package will be delivered today. Thanks for ordering from Amazon!"
    if st.button("Sample Message (French)", key="french_btn_fr", help="Load a sample French message", type="secondary"):
        st.session_state.sms_input = "Bonjour! Votre réservation pour le restaurant est confirmée pour ce soir à 20h. À bientôt!"
    if st.button("Sample Message (Spanish)", key="spanish_btn_es", help="Load a sample Spanish message", type="secondary"):
        st.session_state.sms_input = "Hola, tu cita médica está programada para mañana a las 10:00. Por favor llega 15 minutos antes."

with col2:
    st.markdown("""

Analyze Your Message

""", unsafe_allow_html=True)

    # The widget's value is driven solely by st.session_state["sms_input"];
    # passing `value=` as well conflicts with the session-state write above.
    # The label is hidden rather than empty so the widget keeps an accessible name.
    sms_input = st.text_area(
        "SMS message",
        height=120,
        key="sms_input",
        placeholder="Enter the SMS message you want to analyze here...",
        help="Paste or type the SMS message to check if it's spam or legitimate.",
        label_visibility="collapsed",
    )

    analyze_button = st.button("📱 Analyze Message", use_container_width=True, key="analyze_btn", type="primary")

    has_message = bool(sms_input and sms_input.strip())

    if analyze_button and not has_message:
        # Previously an empty submission did nothing at all.
        st.warning("Please enter a message to analyze.")

    if analyze_button and has_message:
        with st.spinner(""):
            st.markdown("""

Analyzing your message...

""", unsafe_allow_html=True)
            time.sleep(0.5)  # artificial delay kept from the original ("simulate some work")

            # perf_counter is the monotonic clock intended for measuring intervals.
            lang_start_time = time.perf_counter()
            lang_code, lang_confidence, top_langs = detect_language(sms_input, lang_tokenizer, lang_model)
            lang_time = time.perf_counter() - lang_start_time

            lang_name = LANGUAGE_NAMES.get(lang_code, lang_code.capitalize())

            spam_start_time = time.perf_counter()
            is_spam, spam_confidence = classify_spam(sms_input, spam_tokenizer, spam_model)
            spam_time = time.perf_counter() - spam_start_time

        st.markdown("""

Analysis Results

""", unsafe_allow_html=True)

        res_col1, res_col2 = st.columns(2)

        with res_col1:
            # One line per ranked language; separated by blank lines so each
            # renders as its own paragraph.
            top_language_rows = "\n\n".join(
                f"{LANGUAGE_NAMES.get(l_code, l_code.capitalize())}: {l_prob:.1%}"
                for l_code, l_prob in top_langs
            )
            st.markdown(f"""

📊 Language Detection

{lang_name} Detected with {lang_confidence:.1%} confidence

Top language probabilities:

{top_language_rows}

⏱️ Processing time: {lang_time:.3f} seconds

""", unsafe_allow_html=True)

        with res_col2:
            # classify_spam already returns the probability of the predicted
            # class, so it is the confidence of the verdict for both outcomes.
            # (Using 1 - spam_confidence for ham showed the spam probability.)
            result_confidence = spam_confidence
            if is_spam:
                st.markdown(f"""

🔍 Spam Detection

⚠️ SPAM DETECTED

Confidence: {result_confidence:.1%}

This message shows strong indicators of being spam.

⏱️ Processing time: {spam_time:.3f} seconds

""", unsafe_allow_html=True)
            else:
                st.markdown(f"""

🔍 Spam Detection

✅ LEGITIMATE MESSAGE

Confidence: {result_confidence:.1%}

This message appears to be legitimate.

⏱️ Processing time: {spam_time:.3f} seconds

""", unsafe_allow_html=True)

        st.markdown("""

📋 Summary & Recommendations

""", unsafe_allow_html=True)

        if is_spam:
            st.warning("📵 **Recommended Action**: This message should be treated with caution, blocked, or moved to the spam folder.")
            st.markdown(""" **Potential reasons for spam classification:**

Contains suspicious language patterns or urgency.
May include unsolicited offers or links to untrusted sites.
Resembles known spam message structures.

""", unsafe_allow_html=True)
        else:
            st.success("✅ **Recommended Action**: This message seems safe and can be delivered to the inbox.")

        st.markdown("\n\n", unsafe_allow_html=True)

        st.markdown("""

📈 Confidence Visualization

""", unsafe_allow_html=True)

        chart_data = pd.DataFrame({
            'Task': ['Language Detection Confidence', 'Spam Classification Certainty'],
            'Confidence': [lang_confidence, result_confidence],
        })
        st.bar_chart(chart_data.set_index('Task'), height=300, use_container_width=True)

        st.markdown("\n\n", unsafe_allow_html=True)

# Footer (payload is blank in this file).
st.markdown(""" """, unsafe_allow_html=True)