Maria Tsilimos
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -7,12 +7,13 @@ import zipfile
|
|
7 |
import os
|
8 |
import re
|
9 |
import numpy as np
|
|
|
10 |
|
11 |
from cryptography.fernet import Fernet
|
12 |
-
from gliner import GLiNER
|
13 |
-
from PyPDF2 import PdfReader
|
14 |
-
import docx
|
15 |
-
from comet_ml import Experiment
|
16 |
from streamlit_extras.stylable_container import stylable_container
|
17 |
|
18 |
st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
|
@@ -26,16 +27,73 @@ comet_initialized = False
|
|
26 |
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
|
27 |
comet_initialized = True
|
28 |
|
29 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
if 'file_upload_attempts' not in st.session_state:
|
31 |
-
st.session_state['file_upload_attempts'] =
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
if 'encrypted_extracted_text' not in st.session_state:
|
34 |
st.session_state['encrypted_extracted_text'] = None
|
35 |
|
36 |
-
max_attempts = 10
|
37 |
-
|
38 |
-
|
39 |
GLINER_LABELS_CATEGORIZED = {
|
40 |
"Personal Identifiers": [
|
41 |
"Person",
|
@@ -67,9 +125,7 @@ GLINER_LABELS_CATEGORIZED = {
|
|
67 |
],
|
68 |
"Government & Official IDs": [
|
69 |
"Passport number",
|
70 |
-
|
71 |
"Social security number",
|
72 |
-
|
73 |
"CPF",
|
74 |
"Driver license number",
|
75 |
"Tax identification number",
|
@@ -172,10 +228,7 @@ def decrypt_text(encrypted_bytes: bytes) -> str | None:
|
|
172 |
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
|
173 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
|
179 |
expander.write(f'''
|
180 |
**Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
|
181 |
|
@@ -185,7 +238,7 @@ expander.write(f'''
|
|
185 |
|
186 |
**How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
|
187 |
|
188 |
-
**Usage Limits:** You can request results up to
|
189 |
|
190 |
**Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
|
191 |
|
@@ -195,28 +248,49 @@ expander.write(f'''
|
|
195 |
|
196 |
For any errors or inquiries, please contact us at info@nlpblogs.com
|
197 |
''')
|
198 |
-
|
199 |
|
200 |
-
|
201 |
-
|
202 |
with st.sidebar:
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
st.subheader("
|
208 |
-
st.
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
# --- File Upload (PDF/DOCX) ---
|
213 |
uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])
|
214 |
|
215 |
-
# Initialize text for the current run outside the if uploaded_file block
|
216 |
current_run_text = None
|
217 |
|
218 |
if uploaded_file is not None:
|
219 |
file_extension = uploaded_file.name.split('.')[-1].lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
if file_extension == 'pdf':
|
221 |
try:
|
222 |
pdf_reader = PdfReader(uploaded_file)
|
@@ -270,7 +344,9 @@ if st.button("Results"):
|
|
270 |
st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
|
271 |
st.stop()
|
272 |
|
|
|
273 |
st.session_state['file_upload_attempts'] += 1
|
|
|
274 |
|
275 |
with st.spinner("Analyzing text...", show_time=True):
|
276 |
model = load_ner_model()
|
@@ -301,7 +377,7 @@ if st.button("Results"):
|
|
301 |
# --- Add 'category' column to the DataFrame based on the grouped labels ---
|
302 |
df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
|
303 |
# Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
|
304 |
-
df['category'] = df['category'].fillna('Uncategorized')
|
305 |
|
306 |
if comet_initialized:
|
307 |
experiment = Experiment(
|
@@ -313,7 +389,6 @@ if st.button("Results"):
|
|
313 |
experiment.log_table("predicted_entities", df)
|
314 |
experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
|
315 |
|
316 |
-
|
317 |
# --- Display Results ---
|
318 |
st.subheader("Extracted Entities", divider="rainbow")
|
319 |
properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
|
@@ -345,7 +420,6 @@ if st.button("Results"):
|
|
345 |
for i, category_name in enumerate(category_names):
|
346 |
with category_tabs[i]:
|
347 |
|
348 |
-
|
349 |
# Filter the main DataFrame for the current category
|
350 |
df_category_filtered = df[df['category'] == category_name]
|
351 |
|
@@ -367,7 +441,7 @@ if st.button("Results"):
|
|
367 |
st.subheader("Tree map", divider="orange")
|
368 |
# Update treemap path to include category
|
369 |
fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
|
370 |
-
|
371 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
372 |
st.plotly_chart(fig_treemap)
|
373 |
if comet_initialized:
|
@@ -380,7 +454,7 @@ if st.button("Results"):
|
|
380 |
with col1:
|
381 |
st.subheader("Pie Chart (by Entity Type)", divider="orange")
|
382 |
fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
|
383 |
-
|
384 |
fig_pie.update_traces(textposition='inside', textinfo='percent+label')
|
385 |
st.plotly_chart(fig_pie)
|
386 |
if comet_initialized:
|
@@ -389,7 +463,7 @@ if st.button("Results"):
|
|
389 |
with col2:
|
390 |
st.subheader("Bar Chart (by Entity Type)", divider="orange")
|
391 |
fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
|
392 |
-
|
393 |
fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
|
394 |
st.plotly_chart(fig_bar)
|
395 |
if comet_initialized:
|
@@ -399,11 +473,10 @@ if st.button("Results"):
|
|
399 |
st.subheader("Entity Counts by Category", divider="orange")
|
400 |
category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
|
401 |
fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
|
402 |
-
|
403 |
fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
|
404 |
st.plotly_chart(fig_cat_bar)
|
405 |
|
406 |
-
|
407 |
# --- Downloadable Content ---
|
408 |
dfa = pd.DataFrame(
|
409 |
data={
|
|
|
7 |
import os
|
8 |
import re
|
9 |
import numpy as np
|
10 |
+
import json # Added to handle persistent data
|
11 |
|
12 |
from cryptography.fernet import Fernet
|
13 |
+
from gliner import GLiNER
|
14 |
+
from PyPDF2 import PdfReader
|
15 |
+
import docx
|
16 |
+
from comet_ml import Experiment
|
17 |
from streamlit_extras.stylable_container import stylable_container
|
18 |
|
19 |
st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
|
|
|
27 |
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
|
28 |
comet_initialized = True
|
29 |
|
30 |
+
# --- Persistent Counter and History Configuration ---
|
31 |
+
COUNTER_FILE = "counter_ner_app.json"
|
32 |
+
HISTORY_FILE = "file_history_ner_app.json"
|
33 |
+
max_attempts = 300
|
34 |
+
|
35 |
+
# --- Functions to manage persistent data ---
|
36 |
+
def load_attempts():
|
37 |
+
"""
|
38 |
+
Loads the attempts count from a persistent JSON file.
|
39 |
+
Returns 0 if the file doesn't exist or is invalid.
|
40 |
+
"""
|
41 |
+
if os.path.exists(COUNTER_FILE):
|
42 |
+
try:
|
43 |
+
with open(COUNTER_FILE, "r") as f:
|
44 |
+
data = json.load(f)
|
45 |
+
return data.get('file_upload_attempts', 0)
|
46 |
+
except (json.JSONDecodeError, KeyError):
|
47 |
+
return 0
|
48 |
+
return 0
|
49 |
+
|
50 |
+
def save_attempts(attempts):
|
51 |
+
"""
|
52 |
+
Saves the current attempts count to the persistent JSON file.
|
53 |
+
"""
|
54 |
+
with open(COUNTER_FILE, "w") as f:
|
55 |
+
json.dump({'file_upload_attempts': attempts}, f)
|
56 |
+
|
57 |
+
def load_history():
|
58 |
+
"""
|
59 |
+
Loads the file upload history from a persistent JSON file.
|
60 |
+
Returns an empty list if the file doesn't exist or is invalid.
|
61 |
+
"""
|
62 |
+
if os.path.exists(HISTORY_FILE):
|
63 |
+
try:
|
64 |
+
with open(HISTORY_FILE, "r") as f:
|
65 |
+
data = json.load(f)
|
66 |
+
return data.get('uploaded_files', [])
|
67 |
+
except (json.JSONDecodeError, KeyError):
|
68 |
+
return []
|
69 |
+
return []
|
70 |
+
|
71 |
+
def save_history(history):
|
72 |
+
"""
|
73 |
+
Saves the current file upload history to the persistent JSON file.
|
74 |
+
"""
|
75 |
+
with open(HISTORY_FILE, "w") as f:
|
76 |
+
json.dump({'uploaded_files': history}, f)
|
77 |
+
|
78 |
+
def clear_history_data():
|
79 |
+
"""Clears the file history from session state and deletes the persistent file."""
|
80 |
+
if os.path.exists(HISTORY_FILE):
|
81 |
+
os.remove(HISTORY_FILE)
|
82 |
+
st.session_state['uploaded_files_history'] = []
|
83 |
+
st.rerun()
|
84 |
+
|
85 |
+
# --- Initialize session state with persistent data ---
|
86 |
if 'file_upload_attempts' not in st.session_state:
|
87 |
+
st.session_state['file_upload_attempts'] = load_attempts()
|
88 |
+
save_attempts(st.session_state['file_upload_attempts'])
|
89 |
+
|
90 |
+
if 'uploaded_files_history' not in st.session_state:
|
91 |
+
st.session_state['uploaded_files_history'] = load_history()
|
92 |
+
save_history(st.session_state['uploaded_files_history'])
|
93 |
|
94 |
if 'encrypted_extracted_text' not in st.session_state:
|
95 |
st.session_state['encrypted_extracted_text'] = None
|
96 |
|
|
|
|
|
|
|
97 |
GLINER_LABELS_CATEGORIZED = {
|
98 |
"Personal Identifiers": [
|
99 |
"Person",
|
|
|
125 |
],
|
126 |
"Government & Official IDs": [
|
127 |
"Passport number",
|
|
|
128 |
"Social security number",
|
|
|
129 |
"CPF",
|
130 |
"Driver license number",
|
131 |
"Tax identification number",
|
|
|
228 |
st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
|
229 |
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
|
230 |
|
231 |
+
expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
|
|
|
|
|
|
|
232 |
expander.write(f'''
|
233 |
**Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
|
234 |
|
|
|
238 |
|
239 |
**How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
|
240 |
|
241 |
+
**Usage Limits:** You can request results up to 300 requests within a 30-day period.
|
242 |
|
243 |
**Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
|
244 |
|
|
|
248 |
|
249 |
For any errors or inquiries, please contact us at info@nlpblogs.com
|
250 |
''')
|
|
|
251 |
|
|
|
|
|
252 |
# Sidebar: persistent upload-history panel plus promotional links.
with st.sidebar:
    # --- Added Persistent History Display ---
    st.subheader("Your File Upload History", divider="orange")
    if st.session_state['uploaded_files_history']:
        history_df = pd.DataFrame(st.session_state['uploaded_files_history'])
        st.dataframe(history_df, use_container_width=True, hide_index=True)
        # Add a clear history button; clear_history_data() also deletes the
        # on-disk history file and reruns the app.
        if st.button("Clear File History", help="This will permanently delete the file history from the application."):
            clear_history_data()
    else:
        st.info("You have not uploaded any files yet.")

    st.subheader("Build your own NER Web App in a minute without writing a single line of code.", divider="orange")
    st.link_button("NER File Builder",
                   "https://nlpblogs.com/shop/named-entity-recognition-ner/ner-file-builder/",
                   type="primary")
|
271 |
|
272 |
# --- File Upload (PDF/DOCX) ---
uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])

# Text extracted during the current run; stays None until a file is parsed.
current_run_text = None

if uploaded_file is not None:
    file_extension = uploaded_file.name.split('.')[-1].lower()

    # Record the upload in the persistent history, skipping the append when
    # the file matches the most recent history entry — Streamlit reruns the
    # script on every interaction and would otherwise duplicate the row.
    history = st.session_state['uploaded_files_history']
    is_repeat_of_last = bool(history) and history[-1]['filename'] == uploaded_file.name
    if not is_repeat_of_last:
        # --- ADDING TO UPLOAD HISTORY ---
        st.session_state['uploaded_files_history'].append({
            "filename": uploaded_file.name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        })
        save_history(st.session_state['uploaded_files_history'])
|
293 |
+
|
294 |
if file_extension == 'pdf':
|
295 |
try:
|
296 |
pdf_reader = PdfReader(uploaded_file)
|
|
|
344 |
st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
|
345 |
st.stop()
|
346 |
|
347 |
+
# Increment and save the attempts counter
|
348 |
st.session_state['file_upload_attempts'] += 1
|
349 |
+
save_attempts(st.session_state['file_upload_attempts'])
|
350 |
|
351 |
with st.spinner("Analyzing text...", show_time=True):
|
352 |
model = load_ner_model()
|
|
|
377 |
# --- Add 'category' column to the DataFrame based on the grouped labels ---
|
378 |
df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
|
379 |
# Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
|
380 |
+
df['category'] = df['category'].fillna('Uncategorized')
|
381 |
|
382 |
if comet_initialized:
|
383 |
experiment = Experiment(
|
|
|
389 |
experiment.log_table("predicted_entities", df)
|
390 |
experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
|
391 |
|
|
|
392 |
# --- Display Results ---
|
393 |
st.subheader("Extracted Entities", divider="rainbow")
|
394 |
properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
|
|
|
420 |
for i, category_name in enumerate(category_names):
|
421 |
with category_tabs[i]:
|
422 |
|
|
|
423 |
# Filter the main DataFrame for the current category
|
424 |
df_category_filtered = df[df['category'] == category_name]
|
425 |
|
|
|
441 |
st.subheader("Tree map", divider="orange")
|
442 |
# Update treemap path to include category
|
443 |
fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
|
444 |
+
values='score', color='category') # Color by category for better visual distinction
|
445 |
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
|
446 |
st.plotly_chart(fig_treemap)
|
447 |
if comet_initialized:
|
|
|
454 |
with col1:
|
455 |
st.subheader("Pie Chart (by Entity Type)", divider="orange")
|
456 |
fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
|
457 |
+
hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels (Entity Types)')
|
458 |
fig_pie.update_traces(textposition='inside', textinfo='percent+label')
|
459 |
st.plotly_chart(fig_pie)
|
460 |
if comet_initialized:
|
|
|
463 |
with col2:
|
464 |
st.subheader("Bar Chart (by Entity Type)", divider="orange")
|
465 |
fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
|
466 |
+
title='Occurrences of Predicted Labels (Entity Types)', orientation='h')
|
467 |
fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
|
468 |
st.plotly_chart(fig_bar)
|
469 |
if comet_initialized:
|
|
|
473 |
st.subheader("Entity Counts by Category", divider="orange")
|
474 |
category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
|
475 |
fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
|
476 |
+
title='Occurrences of Entities by Category', orientation='h')
|
477 |
fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
|
478 |
st.plotly_chart(fig_cat_bar)
|
479 |
|
|
|
480 |
# --- Downloadable Content ---
|
481 |
dfa = pd.DataFrame(
|
482 |
data={
|