Spaces:

ramadn
/

allergen_detector_bert

Sleeping

App Files Files Community

rdsarjito committed on Apr 30

Commit

9de5935

1 Parent(s): 314c91a

5 commit

Browse files

Files changed (3) hide show

app.py +138 -156
model_loader.py +0 -58
requirements.txt +5 -9

app.py CHANGED Viewed

@@ -2,79 +2,24 @@ import streamlit as st
 import torch
 import torch.nn as nn
 import re
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import numpy as np
 import os
-# Set page configuration
 st.set_page_config(
-    page_title="Allergen Detector",
-    page_icon="🍽️",
     layout="wide"
 )
-# Define styling
-st.markdown("""
-<style>
-    .main-header {
-        font-size: 2.5rem;
-        color: #1E88E5;
-        text-align: center;
-    }
-    .sub-header {
-        font-size: 1.5rem;
-        color: #424242;
-        margin-bottom: 1rem;
-    }
-    .result-positive {
-        font-size: 1.2rem;
-        color: #D32F2F;
-        font-weight: bold;
-    }
-    .result-negative {
-        font-size: 1.2rem;
-        color: #388E3C;
-        font-weight: bold;
-    }
-    .footer {
-        text-align: center;
-        color: #616161;
-        margin-top: 2rem;
-    }
-</style>
-""", unsafe_allow_html=True)
-# App title and description
-st.markdown("<h1 class='main-header'>Allergen Detector</h1>", unsafe_allow_html=True)
-st.markdown("<p class='sub-header'>Detect common allergens in your recipe ingredients</p>", unsafe_allow_html=True)
 # Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Target columns (allergen types)
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
-allergen_display_names = {
-    'susu': 'Milk (Susu)',
-    'kacang': 'Nuts (Kacang)',
-    'telur': 'Eggs (Telur)',
-    'makanan_laut': 'Seafood (Makanan Laut)',
-    'gandum': 'Wheat (Gandum)'
-}
-# Define model for multilabel classification
-class MultilabelBertClassifier(nn.Module):
-    def __init__(self, model_name, num_labels):
-        super(MultilabelBertClassifier, self).__init__()
-        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
-        # Replace the classification head with our own for multilabel
-        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
-    def forward(self, input_ids, attention_mask):
-        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
-        return outputs.logits
 # Clean text function
-@st.cache_data
 def clean_text(text):
     # Convert dashes to spaces for better tokenization
     text = text.replace('--', ' ')
@@ -87,38 +32,51 @@ def clean_text(text):
     text = text.lower()
     return text
-# Function to load model
 @st.cache_resource
 def load_model():
-    try:
-        # Initialize tokenizer
-        tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
-        # Initialize model
-        model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
-        # Load the trained model
-        # In a real deployment, you would use the saved model file
-        # For demo purposes, we'll assume the model file is in the same directory
-        model_path = "model/alergen_model.pt"
-        if os.path.exists(model_path):
-            checkpoint = torch.load(model_path, map_location=device)
             model.load_state_dict(checkpoint['model_state_dict'])
         else:
-            st.error("Model file not found. Please make sure 'alergen_model.pt' is in the same directory.")
         model.to(device)
         model.eval()
         return model, tokenizer
-    except Exception as e:
-        st.error(f"Error loading model: {str(e)}")
-        return None, None
 # Function to predict allergens
 def predict_allergens(model, tokenizer, ingredients_text, max_length=128):
-    if not model or not tokenizer:
         return {}
     # Clean the text
@@ -140,103 +98,127 @@ def predict_allergens(model, tokenizer, ingredients_text, max_length=128):
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
         predictions = torch.sigmoid(outputs)
-        probabilities = predictions.cpu().numpy()[0]
-        binary_predictions = (probabilities > 0.5).astype(bool)
-    result = {
-        'binary': {},
-        'probabilities': {}
-    }
     for i, target in enumerate(target_columns):
-        result['binary'][target] = bool(binary_predictions[i])
-        result['probabilities'][target] = float(probabilities[i])
     return result
-# Main app
 def main():
-    # Load model and tokenizer
     model, tokenizer = load_model()
     # Input area
-    st.markdown("### Enter Recipe Ingredients")
-    ingredients = st.text_area(
-        "Paste your recipe ingredients here:",
-        height=200,
-        placeholder="Example: 1 bungkus Lontong homemade, 2 butir Telur ayam, 2 kotak kecil Tahu coklat..."
-    )
-    # Sample recipe option
-    use_sample = st.checkbox("Use sample recipe")
-    if use_sample:
-        sample_recipe = "1 bungkus Lontong homemade 2 butir Telur ayam 2 kotak kecil Tahu coklat 4 butir kecil Kentang 2 buah Tomat merah 1 buah Ketimun lalap 4 lembar Selada keriting 2 lembar Kol putih 2 porsi Saus kacang homemade 4 buah Kerupuk udang goreng Secukupnya emping goreng 2 sdt Bawang goreng Secukupnya Kecap manis (bila suka)"
-        ingredients = sample_recipe
-        st.text_area("Sample recipe:", value=sample_recipe, height=150, disabled=True)
-    # Analyze button
-    analyze_button = st.button("Analyze Ingredients")
-    # Results section
-    if analyze_button and ingredients:
-        with st.spinner("Analyzing ingredients..."):
-            # Make prediction
-            results = predict_allergens(model, tokenizer, ingredients)
-            if results:
-                st.markdown("### Analysis Results")
-                # Display results in columns
                 col1, col2 = st.columns(2)
                 with col1:
-                    st.markdown("#### Detected Allergens:")
-                    # Check if any allergens were detected
-                    if any(results['binary'].values()):
-                        for allergen, present in results['binary'].items():
-                            if present:
-                                st.markdown(f"<p class='result-positive'>✓ {allergen_display_names[allergen]}</p>", unsafe_allow_html=True)
-                    else:
-                        st.markdown("<p class='result-negative'>No allergens detected</p>", unsafe_allow_html=True)
                 with col2:
-                    st.markdown("#### Confidence Scores:")
-                    for allergen, probability in results['probabilities'].items():
-                        # Create a progress bar for each allergen
-                        st.write(f"{allergen_display_names[allergen]}")
-                        st.progress(probability)
-                        st.write(f"{probability:.2%}")
-                        st.write("")
-                # Display a summary
-                st.markdown("### Summary")
-                detected = [allergen_display_names[a] for a, p in results['binary'].items() if p]
-                if detected:
-                    st.warning(f"This recipe contains the following allergens: {', '.join(detected)}")
-                else:
-                    st.success("This recipe appears to be free from the common allergens we can detect.")
-                st.info("Note: This analysis is based on an AI model and may not be 100% accurate. Always verify allergen information from trusted sources if you have dietary restrictions.")
-    # Information section
-    with st.expander("About This App"):
         st.write("""
-        This allergen detector uses a fine-tuned IndoBERT model to identify common allergens in recipe ingredients.
-        The model can detect the following allergens:
-        - Milk (Susu)
-        - Nuts (Kacang)
-        - Eggs (Telur)
-        - Seafood (Makanan Laut)
-        - Wheat (Gandum)
-        The accuracy of detection depends on how clearly the ingredients are described. The model has been trained on Indonesian recipe data.
         """)
-    # Footer
-    st.markdown("<p class='footer'>Developed with ❤️ using Streamlit and PyTorch</p>", unsafe_allow_html=True)
 if __name__ == "__main__":
     main()

 import torch
 import torch.nn as nn
 import re
+from transformers import AutoTokenizer
 import os
+import numpy as np
+# Set page config
 st.set_page_config(
+    page_title="Allergen Detection App",
+    page_icon="🍲",
     layout="wide"
 )
 # Set device
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Define target columns (allergens)
 target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
 # Clean text function
 def clean_text(text):
     # Convert dashes to spaces for better tokenization
     text = text.replace('--', ' ')
     text = text.lower()
     return text
+# Define model for multilabel classification
class MultilabelBertClassifier(nn.Module):
    """Pretrained encoder plus a linear head producing one logit per label.

    The head reads the hidden state at sequence position 0 (the [CLS] slot)
    and returns raw logits; no activation is applied inside the module.

    Parameters live under the ``bert.*`` and ``classifier.*`` prefixes.
    NOTE(review): a checkpoint saved from a differently structured wrapper
    will not match these keys -- confirm against the training code.

    Args:
        model_name: Identifier or path passed to ``AutoModel.from_pretrained``.
        num_labels: Number of logits to emit (one per target label).
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        # Imported here, as before, so the dependency is only needed when a
        # classifier is actually constructed.
        from transformers import AutoModel

        self.bert = AutoModel.from_pretrained(model_name)
        # The encoder already carries the config it was built from; reuse it
        # instead of resolving the same config a second time.
        self.config = self.bert.config
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Optional mask forwarded to the encoder unchanged;
                when omitted the encoder's own default handling applies.

        Returns:
            The output of the linear head applied to the position-0 hidden
            state of every sequence in the batch.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = encoded.last_hidden_state[:, 0, :]  # [CLS] position
        return self.classifier(cls_state)
+# Load model function
 @st.cache_resource
 def load_model():
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p2')
+    # Initialize model
+    model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
+    # Check if model exists
+    model_path = "model/alergen_model.pt"
+    if os.path.exists(model_path):
+        # Load model weights
+        checkpoint = torch.load(model_path, map_location=device)
+        if 'model_state_dict' in checkpoint:
             model.load_state_dict(checkpoint['model_state_dict'])
         else:
+            model.load_state_dict(checkpoint)
         model.to(device)
         model.eval()
         return model, tokenizer
+    else:
+        st.error("Model file not found. Please upload the model file.")
+        return None, tokenizer
 # Function to predict allergens
 def predict_allergens(model, tokenizer, ingredients_text, max_length=128):
+    if not model:
         return {}
     # Clean the text
     with torch.no_grad():
         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
         predictions = torch.sigmoid(outputs)
+        predictions = (predictions > 0.5).float().cpu().numpy()[0]
+    result = {}
     for i, target in enumerate(target_columns):
+        result[target] = bool(predictions[i])
     return result
+# UI components
 def main():
+    st.title("🍲 Allergen Detection in Indonesian Recipes")
+    st.write("This app predicts common allergens in your recipe based on ingredients.")
+    # Sidebar for model upload
+    with st.sidebar:
+        st.header("Model Settings")
+        uploaded_model = st.file_uploader("Upload model file (model/alergen_model.pt)", type=["pt"])
+        if uploaded_model:
+            # Save uploaded model
+            with open("model/alergen_model.pt", "wb") as f:
+                f.write(uploaded_model.getbuffer())
+            st.success("Model uploaded successfully!")
+        st.markdown("---")
+        st.write("Allergen Categories:")
+        for allergen in target_columns:
+            if allergen == 'susu':
+                st.write("- Susu (Milk)")
+            elif allergen == 'kacang':
+                st.write("- Kacang (Nuts)")
+            elif allergen == 'telur':
+                st.write("- Telur (Eggs)")
+            elif allergen == 'makanan_laut':
+                st.write("- Makanan Laut (Seafood)")
+            elif allergen == 'gandum':
+                st.write("- Gandum (Wheat/Gluten)")
+    # Load model
     model, tokenizer = load_model()
     # Input area
+    st.header("Recipe Ingredients")
+    # Example button
+    if st.button("Load Example"):
+        example_text = "1 bungkus Lontong homemade 2 butir Telur ayam 2 kotak kecil Tahu coklat 4 butir kecil Kentang 2 buah Tomat merah 1 buah Ketimun lalap 4 lembar Selada keriting 2 lembar Kol putih 2 porsi Saus kacang homemade 4 buah Kerupuk udang goreng Secukupnya emping goreng 2 sdt Bawang goreng Secukupnya Kecap manis (bila suka)"
+        st.session_state.ingredients = example_text
+    # Text input
+    ingredients_text = st.text_area(
+        "Enter recipe ingredients (in Indonesian):",
+        height=150,
+        key="ingredients"
+    )
+    # Predict button
+    if st.button("Detect Allergens"):
+        if ingredients_text.strip() == "":
+            st.warning("Please enter ingredients first.")
+        elif model is None:
+            st.error("Please upload the model file first.")
+        else:
+            with st.spinner("Analyzing ingredients..."):
+                # Make prediction
+                allergens = predict_allergens(model, tokenizer, ingredients_text)
+                # Display results
+                st.header("Results")
+                # Create columns for results
                 col1, col2 = st.columns(2)
                 with col1:
+                    st.subheader("Detected Allergens:")
+                    has_allergens = False
+                    for allergen, present in allergens.items():
+                        if present:
+                            has_allergens = True
+                            if allergen == 'susu':
+                                st.warning("🥛 Susu (Milk)")
+                            elif allergen == 'kacang':
+                                st.warning("🥜 Kacang (Nuts)")
+                            elif allergen == 'telur':
+                                st.warning("🥚 Telur (Eggs)")
+                            elif allergen == 'makanan_laut':
+                                st.warning("🦐 Makanan Laut (Seafood)")
+                            elif allergen == 'gandum':
+                                st.warning("🌾 Gandum (Wheat/Gluten)")
+                    if not has_allergens:
+                        st.success("✅ No allergens detected!")
                 with col2:
+                    st.subheader("All Categories:")
+                    for allergen, present in allergens.items():
+                        if allergen == 'susu':
+                            st.write("🥛 Susu (Milk): " + ("Detected ⚠️" if present else "Not detected ✓"))
+                        elif allergen == 'kacang':
+                            st.write("🥜 Kacang (Nuts): " + ("Detected ⚠️" if present else "Not detected ✓"))
+                        elif allergen == 'telur':
+                            st.write("🥚 Telur (Eggs): " + ("Detected ⚠️" if present else "Not detected ✓"))
+                        elif allergen == 'makanan_laut':
+                            st.write("🦐 Makanan Laut (Seafood): " + ("Detected ⚠️" if present else "Not detected ✓"))
+                        elif allergen == 'gandum':
+                            st.write("🌾 Gandum (Wheat/Gluten): " + ("Detected ⚠️" if present else "Not detected ✓"))
+                # Show cleaned text
+                with st.expander("Processed Text"):
+                    st.code(clean_text(ingredients_text))
+    # Instructions and information
+    with st.expander("How to Use"):
         st.write("""
+        1. First, upload the trained model file (`model/alergen_model.pt`) using the sidebar uploader
+        2. Enter your recipe ingredients in the text box (in Indonesian)
+        3. Click the "Detect Allergens" button to analyze the recipe
+        4. View the results showing which allergens are present in your recipe
+        The model detects five common allergen categories: milk, nuts, eggs, seafood, and wheat/gluten.
         """)
 if __name__ == "__main__":
     main()

model_loader.py DELETED Viewed

@@ -1,58 +0,0 @@
-import torch
-import torch.nn as nn
-from transformers import AutoModelForSequenceClassification
-# Define target columns
-target_columns = ['susu', 'kacang', 'telur', 'makanan_laut', 'gandum']
-# Define model class - same as in your original code
-class MultilabelBertClassifier(nn.Module):
-    def __init__(self, model_name, num_labels):
-        super(MultilabelBertClassifier, self).__init__()
-        self.bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
-        # Replace the classification head with our own for multilabel
-        self.bert.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
-    def forward(self, input_ids, attention_mask):
-        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
-        return outputs.logits
-# Function to load the saved model
-def load_saved_model(model_path, device='cpu'):
-    """
-    Load the saved allergen detection model
-    Args:
-        model_path (str): Path to the saved model file
-        device (str): Device to load the model onto ('cpu' or 'cuda')
-    Returns:
-        model: The loaded model
-    """
-    try:
-        # Create model instance
-        model = MultilabelBertClassifier('indobenchmark/indobert-base-p1', len(target_columns))
-        # Load saved weights
-        checkpoint = torch.load(model_path, map_location=device)
-        # Check if model was saved using DataParallel
-        if 'module.' in list(checkpoint['model_state_dict'].keys())[0]:
-            # Create new OrderedDict without 'module.' prefix
-            from collections import OrderedDict
-            new_state_dict = OrderedDict()
-            for k, v in checkpoint['model_state_dict'].items():
-                name = k[7:] if k.startswith('module.') else k
-                new_state_dict[name] = v
-            model.load_state_dict(new_state_dict)
-        else:
-            model.load_state_dict(checkpoint['model_state_dict'])
-        # Move model to device and set to evaluation mode
-        model.to(device)
-        model.eval()
-        return model
-    except Exception as e:
-        print(f"Error loading model: {str(e)}")
-        return None

requirements.txt CHANGED Viewed

@@ -1,9 +1,5 @@
-streamlit==1.30.0
-torch==2.0.1
-transformers==4.35.2
-numpy==1.24.3
-pandas==2.0.3
-scikit-learn==1.3.0
-regex==2023.8.8
-tqdm==4.66.1
-matplotlib==3.7.2

+streamlit>=1.25.0
+torch>=2.0.0
+transformers>=4.30.0
+numpy>=1.22.0
+protobuf>=3.20.0