Maria Tsilimos
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -7,12 +7,13 @@ import zipfile
|
|
7 |
import os
|
8 |
import re
|
9 |
import numpy as np
|
|
|
10 |
|
11 |
from cryptography.fernet import Fernet
|
12 |
-
from gliner import GLiNER
|
13 |
-
from PyPDF2 import PdfReader
|
14 |
-
import docx
|
15 |
-
from comet_ml import Experiment
|
16 |
from streamlit_extras.stylable_container import stylable_container
|
17 |
|
18 |
st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
|
@@ -26,16 +27,73 @@ comet_initialized = False
|
|
26 |
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
|
27 |
comet_initialized = True
|
28 |
|
29 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
if 'file_upload_attempts' not in st.session_state:
|
31 |
-
st.session_state['file_upload_attempts'] =
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
if 'encrypted_extracted_text' not in st.session_state:
|
34 |
st.session_state['encrypted_extracted_text'] = None
|
35 |
|
36 |
-
max_attempts = 10
|
37 |
-
|
38 |
-
|
39 |
GLINER_LABELS_CATEGORIZED = {
|
40 |
"Personal Identifiers": [
|
41 |
"Person",
|
@@ -67,9 +125,7 @@ GLINER_LABELS_CATEGORIZED = {
|
|
67 |
],
|
68 |
"Government & Official IDs": [
|
69 |
"Passport number",
|
70 |
-
|
71 |
"Social security number",
|
72 |
-
|
73 |
"CPF",
|
74 |
"Driver license number",
|
75 |
"Tax identification number",
|
@@ -172,10 +228,7 @@ def decrypt_text(encrypted_bytes: bytes) -> str | None:
|
|
172 |
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
|
173 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
|
179 |
expander.write(f'''
|
180 |
**Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
|
181 |
|
@@ -185,7 +238,7 @@ expander.write(f'''
|
|
185 |
|
186 |
**How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
|
187 |
|
188 |
-
**Usage Limits:** You can request results up to
|
189 |
|
190 |
**Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
|
191 |
|
@@ -195,28 +248,49 @@ expander.write(f'''
|
|
195 |
|
196 |
For any errors or inquiries, please contact us at info@nlpblogs.com
|
197 |
''')
|
198 |
-
|
199 |
|
200 |
-
|
201 |
-
|
202 |
with st.sidebar:
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
st.subheader("
|
208 |
-
st.
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
# --- File Upload (PDF/DOCX) ---
|
213 |
uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])
|
214 |
|
215 |
-
# Initialize text for the current run outside the if uploaded_file block
|
216 |
current_run_text = None
|
217 |
|
218 |
if uploaded_file is not None:
|
219 |
file_extension = uploaded_file.name.split('.')[-1].lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
if file_extension == 'pdf':
|
221 |
try:
|
222 |
pdf_reader = PdfReader(uploaded_file)
|
@@ -270,7 +344,9 @@ if st.button("Results"):
|
|
270 |
st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
|
271 |
st.stop()
|
272 |
|
|
|
273 |
st.session_state['file_upload_attempts'] += 1
|
|
|
274 |
|
275 |
with st.spinner("Analyzing text...", show_time=True):
|
276 |
model = load_ner_model()
|
@@ -301,7 +377,7 @@ if st.button("Results"):
|
|
301 |
# --- Add 'category' column to the DataFrame based on the grouped labels ---
|
302 |
df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
|
303 |
# Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
|
304 |
-
df['category'] = df['category'].fillna('Uncategorized')
|
305 |
|
306 |
if comet_initialized:
|
307 |
experiment = Experiment(
|
@@ -313,7 +389,6 @@ if st.button("Results"):
|
|
313 |
experiment.log_table("predicted_entities", df)
|
314 |
experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
|
315 |
|
316 |
-
|
317 |
# --- Display Results ---
|
318 |
st.subheader("Extracted Entities", divider="rainbow")
|
319 |
properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
|
@@ -345,7 +420,6 @@ if st.button("Results"):
|
|
345 |
for i, category_name in enumerate(category_names):
|
346 |
with category_tabs[i]:
|
347 |
|
348 |
-
|
349 |
# Filter the main DataFrame for the current category
|
350 |
df_category_filtered = df[df['category'] == category_name]
|
351 |
|
@@ -367,7 +441,7 @@ if st.button("Results"):
|
|
367 |
st.subheader("Tree map", divider="orange")
|
368 |
# Update treemap path to include category
|
369 |
fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
|
370 |
-
|
371 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
372 |
st.plotly_chart(fig_treemap)
|
373 |
if comet_initialized:
|
@@ -380,7 +454,7 @@ if st.button("Results"):
|
|
380 |
with col1:
|
381 |
st.subheader("Pie Chart (by Entity Type)", divider="orange")
|
382 |
fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
|
383 |
-
|
384 |
fig_pie.update_traces(textposition='inside', textinfo='percent+label')
|
385 |
st.plotly_chart(fig_pie)
|
386 |
if comet_initialized:
|
@@ -389,7 +463,7 @@ if st.button("Results"):
|
|
389 |
with col2:
|
390 |
st.subheader("Bar Chart (by Entity Type)", divider="orange")
|
391 |
fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
|
392 |
-
|
393 |
fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
|
394 |
st.plotly_chart(fig_bar)
|
395 |
if comet_initialized:
|
@@ -399,11 +473,10 @@ if st.button("Results"):
|
|
399 |
st.subheader("Entity Counts by Category", divider="orange")
|
400 |
category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
|
401 |
fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
|
402 |
-
|
403 |
fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
|
404 |
st.plotly_chart(fig_cat_bar)
|
405 |
|
406 |
-
|
407 |
# --- Downloadable Content ---
|
408 |
dfa = pd.DataFrame(
|
409 |
data={
|
|
|
7 |
import os
|
8 |
import re
|
9 |
import numpy as np
|
10 |
+
import json # Added to handle persistent data
|
11 |
|
12 |
from cryptography.fernet import Fernet
|
13 |
+
from gliner import GLiNER
|
14 |
+
from PyPDF2 import PdfReader
|
15 |
+
import docx
|
16 |
+
from comet_ml import Experiment
|
17 |
from streamlit_extras.stylable_container import stylable_container
|
18 |
|
19 |
st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
|
|
|
27 |
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
|
28 |
comet_initialized = True
|
29 |
|
30 |
+
# --- Persistent Counter and History Configuration ---
|
31 |
+
COUNTER_FILE = "counter_ner_app.json"
|
32 |
+
HISTORY_FILE = "file_history_ner_app.json"
|
33 |
+
max_attempts = 300
|
34 |
+
|
35 |
+
# --- Functions to manage persistent data ---
|
36 |
+
def load_attempts():
|
37 |
+
"""
|
38 |
+
Loads the attempts count from a persistent JSON file.
|
39 |
+
Returns 0 if the file doesn't exist or is invalid.
|
40 |
+
"""
|
41 |
+
if os.path.exists(COUNTER_FILE):
|
42 |
+
try:
|
43 |
+
with open(COUNTER_FILE, "r") as f:
|
44 |
+
data = json.load(f)
|
45 |
+
return data.get('file_upload_attempts', 0)
|
46 |
+
except (json.JSONDecodeError, KeyError):
|
47 |
+
return 0
|
48 |
+
return 0
|
49 |
+
|
50 |
+
def save_attempts(attempts):
|
51 |
+
"""
|
52 |
+
Saves the current attempts count to the persistent JSON file.
|
53 |
+
"""
|
54 |
+
with open(COUNTER_FILE, "w") as f:
|
55 |
+
json.dump({'file_upload_attempts': attempts}, f)
|
56 |
+
|
57 |
+
def load_history():
|
58 |
+
"""
|
59 |
+
Loads the file upload history from a persistent JSON file.
|
60 |
+
Returns an empty list if the file doesn't exist or is invalid.
|
61 |
+
"""
|
62 |
+
if os.path.exists(HISTORY_FILE):
|
63 |
+
try:
|
64 |
+
with open(HISTORY_FILE, "r") as f:
|
65 |
+
data = json.load(f)
|
66 |
+
return data.get('uploaded_files', [])
|
67 |
+
except (json.JSONDecodeError, KeyError):
|
68 |
+
return []
|
69 |
+
return []
|
70 |
+
|
71 |
+
def save_history(history):
|
72 |
+
"""
|
73 |
+
Saves the current file upload history to the persistent JSON file.
|
74 |
+
"""
|
75 |
+
with open(HISTORY_FILE, "w") as f:
|
76 |
+
json.dump({'uploaded_files': history}, f)
|
77 |
+
|
78 |
+
def clear_history_data():
|
79 |
+
"""Clears the file history from session state and deletes the persistent file."""
|
80 |
+
if os.path.exists(HISTORY_FILE):
|
81 |
+
os.remove(HISTORY_FILE)
|
82 |
+
st.session_state['uploaded_files_history'] = []
|
83 |
+
st.rerun()
|
84 |
+
|
85 |
+
# --- Initialize session state with persistent data ---
|
86 |
if 'file_upload_attempts' not in st.session_state:
|
87 |
+
st.session_state['file_upload_attempts'] = load_attempts()
|
88 |
+
save_attempts(st.session_state['file_upload_attempts'])
|
89 |
+
|
90 |
+
if 'uploaded_files_history' not in st.session_state:
|
91 |
+
st.session_state['uploaded_files_history'] = load_history()
|
92 |
+
save_history(st.session_state['uploaded_files_history'])
|
93 |
|
94 |
if 'encrypted_extracted_text' not in st.session_state:
|
95 |
st.session_state['encrypted_extracted_text'] = None
|
96 |
|
|
|
|
|
|
|
97 |
GLINER_LABELS_CATEGORIZED = {
|
98 |
"Personal Identifiers": [
|
99 |
"Person",
|
|
|
125 |
],
|
126 |
"Government & Official IDs": [
|
127 |
"Passport number",
|
|
|
128 |
"Social security number",
|
|
|
129 |
"CPF",
|
130 |
"Driver license number",
|
131 |
"Tax identification number",
|
|
|
228 |
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
|
229 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
|
230 |
|
231 |
+
expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
|
|
|
|
|
|
|
232 |
expander.write(f'''
|
233 |
**Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
|
234 |
|
|
|
238 |
|
239 |
**How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
|
240 |
|
241 |
+
**Usage Limits:** You can request results up to 300 requests within a 30-day period.
|
242 |
|
243 |
**Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
|
244 |
|
|
|
248 |
|
249 |
For any errors or inquiries, please contact us at info@nlpblogs.com
|
250 |
''')
|
|
|
251 |
|
|
|
|
|
252 |
# Sidebar: persistent upload-history panel plus promotional links.
with st.sidebar:
    # --- Added Persistent History Display ---
    st.subheader("Your File Upload History", divider="orange")
    if st.session_state['uploaded_files_history']:
        history_df = pd.DataFrame(st.session_state['uploaded_files_history'])
        st.dataframe(history_df, use_container_width=True, hide_index=True)
        # Add a clear history button; clear_history_data() also deletes the
        # on-disk history file and reruns the app.
        if st.button("Clear File History", help="This will permanently delete the file history from the application."):
            clear_history_data()
    else:
        st.info("You have not uploaded any files yet.")

    st.subheader("Build your own NER Web App in a minute without writing a single line of code.", divider="orange")
    st.link_button("NER File Builder",
                   "https://nlpblogs.com/shop/named-entity-recognition-ner/ner-file-builder/",
                   type="primary")
|
271 |
|
272 |
# --- File Upload (PDF/DOCX) ---
uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])

# Text extracted during the current run; stays None until a file is parsed.
current_run_text = None

if uploaded_file is not None:
    file_extension = uploaded_file.name.split('.')[-1].lower()

    # Record the upload in the persistent history, skipping the append when
    # the file matches the most recent history entry — Streamlit reruns the
    # script on every interaction and would otherwise duplicate the row.
    history = st.session_state['uploaded_files_history']
    is_repeat_of_last = bool(history) and history[-1]['filename'] == uploaded_file.name
    if not is_repeat_of_last:
        # --- ADDING TO UPLOAD HISTORY ---
        st.session_state['uploaded_files_history'].append({
            "filename": uploaded_file.name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        })
        save_history(st.session_state['uploaded_files_history'])
|
293 |
+
|
294 |
if file_extension == 'pdf':
|
295 |
try:
|
296 |
pdf_reader = PdfReader(uploaded_file)
|
|
|
344 |
st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
|
345 |
st.stop()
|
346 |
|
347 |
+
# Increment and save the attempts counter
|
348 |
st.session_state['file_upload_attempts'] += 1
|
349 |
+
save_attempts(st.session_state['file_upload_attempts'])
|
350 |
|
351 |
with st.spinner("Analyzing text...", show_time=True):
|
352 |
model = load_ner_model()
|
|
|
377 |
# --- Add 'category' column to the DataFrame based on the grouped labels ---
|
378 |
df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
|
379 |
# Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
|
380 |
+
df['category'] = df['category'].fillna('Uncategorized')
|
381 |
|
382 |
if comet_initialized:
|
383 |
experiment = Experiment(
|
|
|
389 |
experiment.log_table("predicted_entities", df)
|
390 |
experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
|
391 |
|
|
|
392 |
# --- Display Results ---
|
393 |
st.subheader("Extracted Entities", divider="rainbow")
|
394 |
properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
|
|
|
420 |
for i, category_name in enumerate(category_names):
|
421 |
with category_tabs[i]:
|
422 |
|
|
|
423 |
# Filter the main DataFrame for the current category
|
424 |
df_category_filtered = df[df['category'] == category_name]
|
425 |
|
|
|
441 |
st.subheader("Tree map", divider="orange")
|
442 |
# Update treemap path to include category
|
443 |
fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
|
444 |
+
values='score', color='category') # Color by category for better visual distinction
|
445 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
446 |
st.plotly_chart(fig_treemap)
|
447 |
if comet_initialized:
|
|
|
454 |
with col1:
|
455 |
st.subheader("Pie Chart (by Entity Type)", divider="orange")
|
456 |
fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
|
457 |
+
hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels (Entity Types)')
|
458 |
fig_pie.update_traces(textposition='inside', textinfo='percent+label')
|
459 |
st.plotly_chart(fig_pie)
|
460 |
if comet_initialized:
|
|
|
463 |
with col2:
|
464 |
st.subheader("Bar Chart (by Entity Type)", divider="orange")
|
465 |
fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
|
466 |
+
title='Occurrences of Predicted Labels (Entity Types)', orientation='h')
|
467 |
fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
|
468 |
st.plotly_chart(fig_bar)
|
469 |
if comet_initialized:
|
|
|
473 |
st.subheader("Entity Counts by Category", divider="orange")
|
474 |
category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
|
475 |
fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
|
476 |
+
title='Occurrences of Entities by Category', orientation='h')
|
477 |
fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
|
478 |
st.plotly_chart(fig_cat_bar)
|
479 |
|
|
|
480 |
# --- Downloadable Content ---
|
481 |
dfa = pd.DataFrame(
|
482 |
data={
|