Maria Tsilimos committed on
Commit
e47c2d2
·
unverified ·
1 Parent(s): a3633fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -36
app.py CHANGED
@@ -7,12 +7,13 @@ import zipfile
7
  import os
8
  import re
9
  import numpy as np
 
10
 
11
  from cryptography.fernet import Fernet
12
- from gliner import GLiNER
13
- from PyPDF2 import PdfReader
14
- import docx
15
- from comet_ml import Experiment
16
  from streamlit_extras.stylable_container import stylable_container
17
 
18
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
@@ -26,16 +27,73 @@ comet_initialized = False
26
  if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
27
  comet_initialized = True
28
 
29
- # --- Initialize session state ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  if 'file_upload_attempts' not in st.session_state:
31
- st.session_state['file_upload_attempts'] = 0
 
 
 
 
 
32
 
33
  if 'encrypted_extracted_text' not in st.session_state:
34
  st.session_state['encrypted_extracted_text'] = None
35
 
36
- max_attempts = 10
37
-
38
-
39
  GLINER_LABELS_CATEGORIZED = {
40
  "Personal Identifiers": [
41
  "Person",
@@ -67,9 +125,7 @@ GLINER_LABELS_CATEGORIZED = {
67
  ],
68
  "Government & Official IDs": [
69
  "Passport number",
70
-
71
  "Social security number",
72
-
73
  "CPF",
74
  "Driver license number",
75
  "Tax identification number",
@@ -172,10 +228,7 @@ def decrypt_text(encrypted_bytes: bytes) -> str | None:
172
  st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
173
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
174
 
175
-
176
-
177
-
178
- expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
179
  expander.write(f'''
180
  **Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
181
 
@@ -185,7 +238,7 @@ expander.write(f'''
185
 
186
  **How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
187
 
188
- **Usage Limits:** You can request results up to 10 times.
189
 
190
  **Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
191
 
@@ -195,28 +248,49 @@ expander.write(f'''
195
 
196
  For any errors or inquiries, please contact us at info@nlpblogs.com
197
  ''')
198
-
199
 
200
-
201
-
202
  with st.sidebar:
203
- container = st.container(border=True)
204
- container.write("**Named Entity Recognition (NER)** is the task of "
205
- "extracting and tagging entities in text data. Entities can be persons, "
206
- "organizations, locations, countries, products, events etc.")
207
- st.subheader("Related NER Web Apps", divider="orange")
208
- st.link_button("Scandinavian JSON Entity Finder",
209
- "https://nlpblogs.com/shop/named-entity-recognition-ner/scandinavian-json-entity-finder/",
210
- type="primary")
 
 
 
 
 
 
 
 
 
 
211
 
212
  # --- File Upload (PDF/DOCX) ---
213
  uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])
214
 
215
- # Initialize text for the current run outside the if uploaded_file block
216
  current_run_text = None
217
 
218
  if uploaded_file is not None:
219
  file_extension = uploaded_file.name.split('.')[-1].lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  if file_extension == 'pdf':
221
  try:
222
  pdf_reader = PdfReader(uploaded_file)
@@ -270,7 +344,9 @@ if st.button("Results"):
270
  st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
271
  st.stop()
272
 
 
273
  st.session_state['file_upload_attempts'] += 1
 
274
 
275
  with st.spinner("Analyzing text...", show_time=True):
276
  model = load_ner_model()
@@ -301,7 +377,7 @@ if st.button("Results"):
301
  # --- Add 'category' column to the DataFrame based on the grouped labels ---
302
  df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
303
  # Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
304
- df['category'] = df['category'].fillna('Uncategorized')
305
 
306
  if comet_initialized:
307
  experiment = Experiment(
@@ -313,7 +389,6 @@ if st.button("Results"):
313
  experiment.log_table("predicted_entities", df)
314
  experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
315
 
316
-
317
  # --- Display Results ---
318
  st.subheader("Extracted Entities", divider="rainbow")
319
  properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
@@ -345,7 +420,6 @@ if st.button("Results"):
345
  for i, category_name in enumerate(category_names):
346
  with category_tabs[i]:
347
 
348
-
349
  # Filter the main DataFrame for the current category
350
  df_category_filtered = df[df['category'] == category_name]
351
 
@@ -367,7 +441,7 @@ if st.button("Results"):
367
  st.subheader("Tree map", divider="orange")
368
  # Update treemap path to include category
369
  fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
370
- values='score', color='category') # Color by category for better visual distinction
371
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
372
  st.plotly_chart(fig_treemap)
373
  if comet_initialized:
@@ -380,7 +454,7 @@ if st.button("Results"):
380
  with col1:
381
  st.subheader("Pie Chart (by Entity Type)", divider="orange")
382
  fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
383
- hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels (Entity Types)')
384
  fig_pie.update_traces(textposition='inside', textinfo='percent+label')
385
  st.plotly_chart(fig_pie)
386
  if comet_initialized:
@@ -389,7 +463,7 @@ if st.button("Results"):
389
  with col2:
390
  st.subheader("Bar Chart (by Entity Type)", divider="orange")
391
  fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
392
- title='Occurrences of Predicted Labels (Entity Types)', orientation='h')
393
  fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
394
  st.plotly_chart(fig_bar)
395
  if comet_initialized:
@@ -399,11 +473,10 @@ if st.button("Results"):
399
  st.subheader("Entity Counts by Category", divider="orange")
400
  category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
401
  fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
402
- title='Occurrences of Entities by Category', orientation='h')
403
  fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
404
  st.plotly_chart(fig_cat_bar)
405
 
406
-
407
  # --- Downloadable Content ---
408
  dfa = pd.DataFrame(
409
  data={
 
7
  import os
8
  import re
9
  import numpy as np
10
+ import json # Added to handle persistent data
11
 
12
  from cryptography.fernet import Fernet
13
+ from gliner import GLiNER
14
+ from PyPDF2 import PdfReader
15
+ import docx
16
+ from comet_ml import Experiment
17
  from streamlit_extras.stylable_container import stylable_container
18
 
19
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
 
27
  if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
28
  comet_initialized = True
29
 
30
+ # --- Persistent Counter and History Configuration ---
31
+ COUNTER_FILE = "counter_ner_app.json"
32
+ HISTORY_FILE = "file_history_ner_app.json"
33
+ max_attempts = 300
34
+
35
+ # --- Functions to manage persistent data ---
36
def load_attempts(counter_file=None):
    """Load the persisted upload-attempts count from a JSON file.

    Args:
        counter_file: Optional path override; defaults to the module-level
            COUNTER_FILE. Added (backward compatible) so callers and tests
            can target a different file.

    Returns:
        int: The stored 'file_upload_attempts' value, or 0 when the file is
        missing, unreadable, malformed, or not a JSON object.
    """
    path = COUNTER_FILE if counter_file is None else counter_file
    if not os.path.exists(path):
        return 0
    try:
        with open(path, "r") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # File exists but is unreadable or corrupt: treat as a fresh counter.
        # (The original caught KeyError, which dict.get never raises, and
        # missed OSError entirely.)
        return 0
    if not isinstance(data, dict):
        # Valid JSON with the wrong top-level type would have crashed the
        # original with AttributeError on .get(); degrade gracefully instead.
        return 0
    return data.get('file_upload_attempts', 0)
49
+
50
def save_attempts(attempts, counter_file=None):
    """Persist the current upload-attempts count to a JSON file.

    Args:
        attempts: The integer attempts count to store.
        counter_file: Optional path override; defaults to the module-level
            COUNTER_FILE. Added (backward compatible) so callers and tests
            can target a different file.

    The file is written as {"file_upload_attempts": attempts}, matching
    what load_attempts() reads back.
    """
    path = COUNTER_FILE if counter_file is None else counter_file
    with open(path, "w") as f:
        json.dump({'file_upload_attempts': attempts}, f)
56
+
57
def load_history(history_file=None):
    """Load the persisted file-upload history from a JSON file.

    Args:
        history_file: Optional path override; defaults to the module-level
            HISTORY_FILE. Added (backward compatible) so callers and tests
            can target a different file.

    Returns:
        list: The stored 'uploaded_files' entries, or [] when the file is
        missing, unreadable, malformed, or not a JSON object.
    """
    path = HISTORY_FILE if history_file is None else history_file
    if not os.path.exists(path):
        return []
    try:
        with open(path, "r") as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        # Unreadable or corrupt history: start fresh rather than crash.
        # (The original's KeyError clause was dead code — dict.get never
        # raises it — and OSError was not handled at all.)
        return []
    if not isinstance(data, dict):
        # Guard against valid JSON with a non-object top level, which would
        # have raised AttributeError on .get() in the original.
        return []
    return data.get('uploaded_files', [])
70
+
71
def save_history(history, history_file=None):
    """Persist the file-upload history to a JSON file.

    Args:
        history: List of upload entries (dicts with at least 'filename'
            and 'timestamp' keys, as built by the upload handler).
        history_file: Optional path override; defaults to the module-level
            HISTORY_FILE. Added (backward compatible) so callers and tests
            can target a different file.

    The file is written as {"uploaded_files": history}, matching what
    load_history() reads back.
    """
    path = HISTORY_FILE if history_file is None else history_file
    with open(path, "w") as f:
        json.dump({'uploaded_files': history}, f)
77
+
78
def clear_history_data():
    """Remove the persisted upload history and reset the in-session copy.

    Deletes HISTORY_FILE from disk when present, empties the
    'uploaded_files_history' entry in Streamlit session state, and triggers
    a rerun so the sidebar immediately reflects the cleared history.
    """
    history_path = HISTORY_FILE
    history_file_present = os.path.exists(history_path)
    if history_file_present:
        os.remove(history_path)
    st.session_state['uploaded_files_history'] = []
    st.rerun()
84
+
85
+ # --- Initialize session state with persistent data ---
86
  if 'file_upload_attempts' not in st.session_state:
87
+ st.session_state['file_upload_attempts'] = load_attempts()
88
+ save_attempts(st.session_state['file_upload_attempts'])
89
+
90
+ if 'uploaded_files_history' not in st.session_state:
91
+ st.session_state['uploaded_files_history'] = load_history()
92
+ save_history(st.session_state['uploaded_files_history'])
93
 
94
  if 'encrypted_extracted_text' not in st.session_state:
95
  st.session_state['encrypted_extracted_text'] = None
96
 
 
 
 
97
  GLINER_LABELS_CATEGORIZED = {
98
  "Personal Identifiers": [
99
  "Person",
 
125
  ],
126
  "Government & Official IDs": [
127
  "Passport number",
 
128
  "Social security number",
 
129
  "CPF",
130
  "Driver license number",
131
  "Tax identification number",
 
228
  st.subheader("Multilingual PDF & DOCX Entity Finder", divider="orange") # Updated title
229
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
230
 
231
+ expander = st.expander("**Important notes on the Multilingual PDF & DOCX Entity Finder**")
 
 
 
232
  expander.write(f'''
233
  **Named Entities:** This Multilingual PDF & DOCX Entity Finder predicts a wide range of custom labels, including: "Person", "Organization", "Phone number", "Address", "Passport number", "Email", "Credit card number", "Social security number", "Health insurance ID number", "Date of birth", "Mobile phone number", "Bank account number", "Medication", "CPF", "Driver license number", "Tax identification number", "Medical condition", "Identity card number", "National ID number", "IP address", "IBAN", "Credit card expiration date", "Username", "Health insurance number", "Registration number", "Student ID number", "Insurance number", "Flight number", "Landline phone number", "Blood type", "CVV", "Reservation number", "Digital signature", "Social media handle", "License plate number", "CNPJ", "Postal code", "Serial number", "Vehicle registration number", "Credit card brand", "Fax number", "Visa number", "Insurance company", "Identity document number", "Transaction number", "National health insurance number", "CVC", "Birth certificate number", "Train ticket number", "Passport expiration date"
234
 
 
238
 
239
  **How to Use:** Upload your PDF or DOCX file. Then, click the 'Results' button to extract and tag entities in your text data.
240
 
241
+ **Usage Limits:** You can request results up to 300 requests within a 30-day period.
242
 
243
  **Language settings:** Please check and adjust the language settings in your computer, so the French, German, Spanish, Portuguese and Italian characters are handled properly in your downloaded file.
244
 
 
248
 
249
  For any errors or inquiries, please contact us at info@nlpblogs.com
250
  ''')
 
251
 
 
 
252
  with st.sidebar:
253
+
254
+
255
+
256
+ # --- Added Persistent History Display ---
257
+ st.subheader("Your File Upload History", divider="orange")
258
+ if st.session_state['uploaded_files_history']:
259
+ history_df = pd.DataFrame(st.session_state['uploaded_files_history'])
260
+ st.dataframe(history_df, use_container_width=True, hide_index=True)
261
+ # Add a clear history button
262
+ if st.button("Clear File History", help="This will permanently delete the file history from the application."):
263
+ clear_history_data()
264
+ else:
265
+ st.info("You have not uploaded any files yet.")
266
+
267
+ st.subheader("Build your own NER Web App in a minute without writing a single line of code.", divider="orange")
268
+ st.link_button("NER File Builder",
269
+ "https://nlpblogs.com/shop/named-entity-recognition-ner/ner-file-builder/",
270
+ type="primary")
271
 
272
  # --- File Upload (PDF/DOCX) ---
273
  uploaded_file = st.file_uploader("Upload your file. Accepted file formats include: .pdf, .docx", type=['pdf', 'docx'])
274
 
 
275
  current_run_text = None
276
 
277
  if uploaded_file is not None:
278
  file_extension = uploaded_file.name.split('.')[-1].lower()
279
+
280
+ # Check if this file has already been processed and is the same as the last one
281
+ # This prevents re-adding the same file to history on every rerun of the app
282
+ if st.session_state['uploaded_files_history'] and uploaded_file.name == st.session_state['uploaded_files_history'][-1]['filename']:
283
+ # Do not re-add to history, just process the file
284
+ pass
285
+ else:
286
+ # --- ADDING TO UPLOAD HISTORY ---
287
+ new_upload_entry = {
288
+ "filename": uploaded_file.name,
289
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
290
+ }
291
+ st.session_state['uploaded_files_history'].append(new_upload_entry)
292
+ save_history(st.session_state['uploaded_files_history'])
293
+
294
  if file_extension == 'pdf':
295
  try:
296
  pdf_reader = PdfReader(uploaded_file)
 
344
  st.warning("No extractable text content available for analysis. Please upload a valid PDF or DOCX file.")
345
  st.stop()
346
 
347
+ # Increment and save the attempts counter
348
  st.session_state['file_upload_attempts'] += 1
349
+ save_attempts(st.session_state['file_upload_attempts'])
350
 
351
  with st.spinner("Analyzing text...", show_time=True):
352
  model = load_ner_model()
 
377
  # --- Add 'category' column to the DataFrame based on the grouped labels ---
378
  df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
379
  # Handle cases where an entity_group might not have a category (shouldn't happen if maps are complete)
380
+ df['category'] = df['category'].fillna('Uncategorized')
381
 
382
  if comet_initialized:
383
  experiment = Experiment(
 
389
  experiment.log_table("predicted_entities", df)
390
  experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
391
 
 
392
  # --- Display Results ---
393
  st.subheader("Extracted Entities", divider="rainbow")
394
  properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
 
420
  for i, category_name in enumerate(category_names):
421
  with category_tabs[i]:
422
 
 
423
  # Filter the main DataFrame for the current category
424
  df_category_filtered = df[df['category'] == category_name]
425
 
 
441
  st.subheader("Tree map", divider="orange")
442
  # Update treemap path to include category
443
  fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
444
+ values='score', color='category') # Color by category for better visual distinction
445
  fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
446
  st.plotly_chart(fig_treemap)
447
  if comet_initialized:
 
454
  with col1:
455
  st.subheader("Pie Chart (by Entity Type)", divider="orange")
456
  fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
457
+ hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels (Entity Types)')
458
  fig_pie.update_traces(textposition='inside', textinfo='percent+label')
459
  st.plotly_chart(fig_pie)
460
  if comet_initialized:
 
463
  with col2:
464
  st.subheader("Bar Chart (by Entity Type)", divider="orange")
465
  fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
466
+ title='Occurrences of Predicted Labels (Entity Types)', orientation='h')
467
  fig_bar.update_layout(yaxis={'categoryorder':'total ascending'}) # Order bars
468
  st.plotly_chart(fig_bar)
469
  if comet_initialized:
 
473
  st.subheader("Entity Counts by Category", divider="orange")
474
  category_counts = df['category'].value_counts().reset_index().rename(columns={"index": "category", "count": "count"})
475
  fig_cat_bar = px.bar(category_counts, x="count", y="category", color="category", text_auto=True,
476
+ title='Occurrences of Entities by Category', orientation='h')
477
  fig_cat_bar.update_layout(yaxis={'categoryorder':'total ascending'})
478
  st.plotly_chart(fig_cat_bar)
479
 
 
480
  # --- Downloadable Content ---
481
  dfa = pd.DataFrame(
482
  data={