Maria Tsilimos committed
Commit 099d76a · unverified · 1 Parent(s): 029d585

Create app.py

Files changed (1): app.py (+313, -0)
app.py ADDED
@@ -0,0 +1,313 @@
import requests
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline
import plotly.express as px
import io
import os
from comet_ml import Experiment
import zipfile
from streamlit_extras.stylable_container import stylable_container
import numpy as np


st.set_page_config(layout="wide", page_title="Named Entity Recognition App")


COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")

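# Comet experiment tracking is optional: runs are logged only when all three
# environment variables above are set, and the app works normally without them.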
comet_initialized = False
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
    comet_initialized = True


st.subheader("18-Chinese Named Entity Recognition Web App", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

expander = st.expander("**Important notes on the 18-Chinese Named Entity Recognition Web App**")
expander.write('''
**Named Entities:** This 18-Chinese Named Entity Recognition Web App predicts eighteen (18) labels ("**CARDINAL**: cardinal number", "**DATE**: date", "**EVENT**: event name", "**FAC**: facilities", "**GPE**: geopolitical entity", "**LANGUAGE**: language", "**LAW**: law", "**LOC**: location", "**MONEY**: money", "**NORP**: ethnic, religious, political groups", "**ORDINAL**: ordinal number", "**ORG**: organization", "**PERCENT**: percent value", "**PERSON**: person", "**PRODUCT**: product", "**QUANTITY**: quantity", "**TIME**: time", "**WORK_OF_ART**: work of art"). Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags.

**How to Use:** Paste a URL and press Enter. If you type or paste text instead, press Ctrl + Enter.

**Usage Limits:** You can request results up to 10 times per session.

**Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings, and then choose the app theme, colors and fonts.

**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.

For any errors or inquiries, please contact us at info@nlpblogs.com
''')


with st.sidebar:
    container = st.container(border=True)
    container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
    st.subheader("Related NLP Web Apps", divider="rainbow")
    st.link_button("58-Italian-Named-Entity-Recognition-PDF-DOCX-Web App", "https://nlpblogs.com/shop/named-entity-recognition-ner/58-italian-named-entity-recognition-web-app/", type="primary")

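# A simple per-session rate limiter: st.session_state persists across
# Streamlit reruns within one browser session, so the counter survives
# widget interactions and resets only when the app is reopened.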
if 'source_type_attempts' not in st.session_state:
    st.session_state['source_type_attempts'] = 0
max_attempts = 10

def clear_url_input():
    st.session_state.url = ""

def clear_text_input():
    st.session_state.my_text_area = ""

url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
st.button("Clear URL", on_click=clear_url_input)

text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
st.button("Clear Text", on_click=clear_text_input)


source_type = None
input_content = None
text_to_process = None

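# Decide which input to process; because the URL field is checked first,
# a URL takes precedence when both inputs are filled in.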
if url:
    source_type = 'url'
    input_content = url
elif text:
    source_type = 'text'
    input_content = text

if source_type:

    st.subheader("Results", divider="rainbow")

    if st.session_state['source_type_attempts'] >= max_attempts:
        st.error(f"You have requested results {max_attempts} times. You have reached your request limit for this session.")
        st.stop()

    st.session_state['source_type_attempts'] += 1

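    # Cache the Hugging Face pipeline with st.cache_resource so the model is
    # loaded once and shared across reruns and sessions instead of being
    # reloaded on every interaction. aggregation_strategy="max" merges subword
    # tokens into word-level entities, labelling each word by its
    # highest-scoring token.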
    @st.cache_resource
    def load_ner_model():
        return pipeline("token-classification", model="ckiplab/albert-tiny-chinese-ner", aggregation_strategy="max")

    model = load_ner_model()
    experiment = None

    try:
        if source_type == 'url':
            if not url.startswith(("http://", "https://")):
                st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
            else:
                with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True):
                    f = requests.get(url, timeout=10)
                    f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
                    soup = BeautifulSoup(f.text, 'html.parser')
                    text_to_process = soup.get_text(separator=' ', strip=True)
                    st.divider()
                    st.write("**Input text content**")
                    st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)

        elif source_type == 'text':
            text_to_process = text
            st.divider()
            st.write("**Input text content**")
            st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)

        if text_to_process and len(text_to_process.strip()) > 0:
            with st.spinner("Analyzing text...", show_time=True):
                entities = model(text_to_process)
                data = []
                for entity in entities:
                    data.append({
                        'word': entity['word'],
                        'entity_group': entity['entity_group'],
                        'score': entity['score'],
                        'start': entity['start'],  # Include start and end for download
                        'end': entity['end']
                    })
                # Explicit columns keep the DataFrame schema stable even when
                # no entities are found; otherwise df['word'] below raises a
                # KeyError on an empty result.
                df = pd.DataFrame(data, columns=['word', 'entity_group', 'score', 'start', 'end'])

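                # Strip punctuation from the extracted words; Python's
                # Unicode-aware \w matches CJK characters, so the Chinese
                # text itself is preserved.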
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                df = df.replace('', 'Unknown')
                st.dataframe(df)

                if comet_initialized:
                    experiment = Experiment(
                        api_key=COMET_API_KEY,
                        workspace=COMET_WORKSPACE,
                        project_name=COMET_PROJECT_NAME,
                    )
                    experiment.log_parameter("input_source_type", source_type)
                    experiment.log_parameter("input_content_length", len(input_content))
                    experiment.log_table("predicted_entities", df)

                with st.expander("See Glossary of tags"):
                    st.write('''
                    '**word**': ['entity extracted from your text data']

                    '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']

                    '**entity_group**': ['label (tag) assigned to a given extracted entity']

                    '**start**': ['index of the start of the corresponding entity']

                    '**end**': ['index of the end of the corresponding entity']
                    ''')

                entity_groups = {
                    "CARDINAL": "cardinal number",
                    "DATE": "date",
                    "EVENT": "event name",
                    "FAC": "facilities",
                    "GPE": "geopolitical entity",
                    "LANGUAGE": "language",
                    "LAW": "law",
                    "LOC": "location",
                    "MONEY": "money",
                    "NORP": "ethnic, religious, political groups",
                    "ORDINAL": "ordinal number",
                    "ORG": "organization",
                    "PERCENT": "percent value",
                    "PERSON": "person",
                    "PRODUCT": "product",
                    "QUANTITY": "quantity",
                    "TIME": "time",
                    "WORK_OF_ART": "work of art",
                }

                st.subheader("Grouped entities", divider="rainbow")

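                # Lay the 18 label tabs out in rows of five; stepping range()
                # by tabs_per_row walks the list in chunks of 5, 5, 5, and 3.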
                # Convert entity_groups dictionary to a list of (key, title) tuples
                entity_items = list(entity_groups.items())
                # Define how many tabs per row
                tabs_per_row = 5
                for i in range(0, len(entity_items), tabs_per_row):
                    current_row_entities = entity_items[i : i + tabs_per_row]
                    tab_titles = [item[1] for item in current_row_entities]
                    tabs = st.tabs(tab_titles)
                    for j, (entity_group_key, tab_title) in enumerate(current_row_entities):
                        with tabs[j]:
                            if entity_group_key in df["entity_group"].unique():
                                df_filtered = df[df["entity_group"] == entity_group_key]
                                st.dataframe(df_filtered, use_container_width=True)
                            else:
                                st.info(f"No '{tab_title}' entities found in the text.")
                                st.dataframe(pd.DataFrame({
                                    'entity_group': [entity_group_key],
                                    'score': [np.nan],
                                    'word': [np.nan],
                                    'start': [np.nan],
                                    'end': [np.nan]
                                }), hide_index=True)

                st.divider()

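                # Visualize predictions only when at least one entity was found;
                # the treemap nests words under their entity group, sized by
                # confidence score.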
                if not df.empty:

                    st.markdown("---")
                    st.subheader("Treemap", divider="rainbow")
                    fig = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
                                     values='score', color='entity_group',
                                     )
                    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                    st.plotly_chart(fig, use_container_width=True)
                    if comet_initialized and experiment:
                        experiment.log_figure(figure=fig, figure_name="entity_treemap")

                    value_counts = df['entity_group'].value_counts().reset_index()
                    value_counts.columns = ['entity_group', 'count']

                    col1, col2 = st.columns(2)
                    with col1:
                        st.subheader("Pie Chart", divider="rainbow")
                        fig1 = px.pie(value_counts, values='count', names='entity_group',
                                      hover_data=['count'], labels={'count': 'count'},
                                      title='Percentage of Predicted Labels')
                        fig1.update_traces(textposition='inside', textinfo='percent+label')
                        st.plotly_chart(fig1, use_container_width=True)
                        if comet_initialized and experiment:  # Check if experiment is initialized
                            experiment.log_figure(figure=fig1, figure_name="label_pie_chart")

                    with col2:
                        st.subheader("Bar Chart", divider="rainbow")
                        fig2 = px.bar(value_counts, x="count", y="entity_group", color="entity_group",
                                      text_auto=True, title='Occurrences of Predicted Labels')
                        st.plotly_chart(fig2, use_container_width=True)
                        if comet_initialized and experiment:  # Check if experiment is initialized
                            experiment.log_figure(figure=fig2, figure_name="label_bar_chart")
                else:
                    st.warning("No entities were extracted from the provided text.")
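                # Bundle the results table and the glossary into one in-memory
                # zip archive so the download needs no temporary files on disk.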
                dfa = pd.DataFrame(
                    data={
                        'word': ['entity extracted from your text data'],
                        'score': ['accuracy score; how accurately a tag has been assigned to a given entity'],
                        'entity_group': ['label (tag) assigned to a given extracted entity'],
                        'start': ['index of the start of the corresponding entity'],
                        'end': ['index of the end of the corresponding entity'],
                    }
                )
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    if not df.empty:
                        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

                with stylable_container(
                    key="download_button",
                    css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                ):
                    st.download_button(
                        label="Download zip file",
                        data=buf.getvalue(),
                        file_name="nlpblogs_ner_results.zip",
                        mime="application/zip",
                    )

                st.divider()
        else:
            st.warning("No meaningful text found to process. Please enter a URL or text.")

    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
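    # The finally block closes the Comet experiment even when an exception
    # occurred, so runs are not left open on the Comet server.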
    finally:
        if comet_initialized and experiment:
            experiment.end()

st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")