Spaces:

AIEcosystem
/

18-Chinese-NER-TXT-URL-Web-App

Sleeping

App Files Files Community

Maria Tsilimos committed 25 days ago

Commit

099d76a

unverified ·

1 Parent(s): 029d585

Create app.py

Browse files

Files changed (1) hide show

app.py +313 -0

app.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+import pandas as pd
+from transformers import pipeline
+import plotly.express as px
+import time
+import io
+import os
+from comet_ml import Experiment
+import zipfile
+import re
+from streamlit_extras.stylable_container import stylable_container
+import numpy as np
+st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
+COMET_API_KEY = os.environ.get("COMET_API_KEY")
+COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
+COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
+comet_initialized = False
+if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
+    comet_initialized = True
+st.subheader("18-Chinese Named Entity Recognition Web App", divider="rainbow")
+st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
+expander = st.expander("**Important notes on the 18-Chinese Named Entity Recognition Web App**")
+expander.write('''
+        **Named Entities:** This 18-Chinese Named Entity Recognition Web App predicts eighteen (18) labels ("**CARDINAL**: cardinal number”, “**DATE**: date”, “**EVENT**: event name”, “**FAC**: facilities”, “**GPE**: geopolitical entity”, "**LANGUAGE**: language", "**LAW**: law", "**LOC**: location", "**MONEY**: money", "**NORP**: ethnic, religious, political groups", "**ORDINAL**: ordinal number", "**ORG**: organization", "**PERCENT**: percent value", "**PERSON**: person", "**PRODUCT**: product", "**QUANTITY**: quantity", "**TIME**: time", "**WORK_OF_ART**: work of art"). Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags.
+        **How to Use:** Paste a URL, and then press Enter. If you type or paste text, just press Ctrl + Enter.
+        **Usage Limits:** You can request results up to 10 times.
+        **Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.
+        **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
+        For any errors or inquiries, please contact us at info@nlpblogs.com
+    ''')
+with st.sidebar:
+    container = st.container(border=True)
+    container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
+    st.subheader("Related NLP Web Apps", divider="rainbow")
+    st.link_button("58-Italian-Named-Entity-Recognition-PDF-DOCX-Web App", "https://nlpblogs.com/shop/named-entity-recognition-ner/58-italian-named-entity-recognition-web-app/", type = "primary")
+if 'source_type_attempts' not in st.session_state:
+    st.session_state['source_type_attempts'] = 0
+max_attempts = 10
+def clear_url_input():
+    st.session_state.url = ""
+def clear_text_input():
+    st.session_state.my_text_area = ""
+url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
+st.button("Clear URL", on_click=clear_url_input)
+text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
+st.button("Clear Text", on_click=clear_text_input)
+source_type = None
+input_content = None
+text_to_process = None
+if url:
+    source_type = 'url'
+    input_content = url
+elif text:
+    source_type = 'text'
+    input_content = text
+if source_type:
+    st.subheader("Results", divider = "rainbow")
+    if st.session_state['source_type_attempts'] >= max_attempts:
+        st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
+        st.stop()
+    st.session_state['source_type_attempts'] += 1
+    @st.cache_resource
+    def load_ner_model():
+        return pipeline("token-classification", model="ckiplab/albert-tiny-chinese-ner", aggregation_strategy="max")
+    model = load_ner_model()
+    experiment = None
+    try:
+        if source_type == 'url':
+            if not url.startswith(("http://", "https://")):
+                st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
+            else:
+                with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True):
+                    f = requests.get(url, timeout=10)
+                    f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
+                    soup = BeautifulSoup(f.text, 'html.parser')
+                    text_to_process = soup.get_text(separator=' ', strip=True)
+                    st.divider()
+                    st.write("**Input text content**")
+                    st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)
+        elif source_type == 'text':
+            text_to_process = text
+            st.divider()
+            st.write("**Input text content**")
+            st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)
+        if text_to_process and len(text_to_process.strip()) > 0:
+            with st.spinner("Analyzing text...", show_time=True):
+                entities = model(text_to_process)
+                data = []
+                for entity in entities:
+                    data.append({
+                        'word': entity['word'],
+                        'entity_group': entity['entity_group'],
+                        'score': entity['score'],
+                        'start': entity['start'], # Include start and end for download
+                        'end': entity['end']
+                    })
+                df = pd.DataFrame(data)
+                pattern = r'[^\w\s]'
+                df['word'] = df['word'].replace(pattern, '', regex=True)
+                df = df.replace('', 'Unknown')
+                st.dataframe(df)
+                if comet_initialized:
+                    experiment = Experiment(
+                        api_key=COMET_API_KEY,
+                        workspace=COMET_WORKSPACE,
+                        project_name=COMET_PROJECT_NAME,
+                    )
+                    experiment.log_parameter("input_source_type", source_type)
+                    experiment.log_parameter("input_content_length", len(input_content))
+                    experiment.log_table("predicted_entities", df)
+                with st.expander("See Glossary of tags"):
+                    st.write('''
+                    '**word**': ['entity extracted from your text data']
+                    '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']
+                    '**entity_group**': ['label (tag) assigned to a given extracted entity']
+                    '**start**': ['index of the start of the corresponding entity']
+                    '**end**': ['index of the end of the corresponding entity']
+                    ''')
+                entity_groups = {"CARDINAL": "cardinal number",
+                         "DATE": "date",
+                         "EVENT": "event name",
+                         "FAC": "facilities",
+                         "GPE": "geopolitical entity",
+                         "LANGUAGE": "language",
+                         "LAW": "law",
+                         "LOC": "location",
+                         "MONEY": "money",
+                         "NORP": "ethnic, religious, political groups",
+                         "ORDINAL": "ordinal number",
+                         "ORG": "organization",
+                         "PERCENT": "percent value",
+                         "PERSON": "person",
+                         "PRODUCT": "product",
+                         "QUANTITY": "quantity",
+                         "TIME": "time",
+                         "WORK_OF_ART": "work of art",
+                        }
+                st.subheader("Grouped entities", divider = "rainbow")
+                    # Convert entity_groups dictionary to a list of (key, title) tuples
+                entity_items = list(entity_groups.items())
+                    # Define how many tabs per row
+                tabs_per_row = 5
+                for i in range(0, len(entity_items), tabs_per_row):
+                    current_row_entities = entity_items[i : i + tabs_per_row]
+                    tab_titles = [item[1] for item in current_row_entities]
+                    tabs = st.tabs(tab_titles)
+                    for j, (entity_group_key, tab_title) in enumerate(current_row_entities):
+                        with tabs[j]:
+                            if entity_group_key in df["entity_group"].unique():
+                                df_filtered = df[df["entity_group"] == entity_group_key]
+                                st.dataframe(df_filtered, use_container_width=True)
+                            else:
+                                st.info(f"No '{tab_title}' entities found in the text.")
+                                st.dataframe(pd.DataFrame({
+                                        'entity_group': [entity_group_key],
+                                        'score': [np.nan],
+                                        'word': [np.nan],
+                                        'start': [np.nan],
+                                        'end': [np.nan]
+                                    }), hide_index=True)
+                st.divider()
+                if not df.empty:
+                    st.markdown("---")
+                    st.subheader("Treemap", divider="rainbow")
+                    fig = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
+                                     values='score', color='entity_group',
+                                     )
+                    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+                    st.plotly_chart(fig, use_container_width=True)
+                    if comet_initialized and experiment:
+                        experiment.log_figure(figure=fig, figure_name="entity_treemap")
+                    value_counts = df['entity_group'].value_counts().reset_index()
+                    value_counts.columns = ['entity_group', 'count']
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.subheader("Pie Chart", divider="rainbow")
+                        fig1 = px.pie(value_counts, values='count', names='entity_group',
+                                      hover_data=['count'], labels={'count': 'count'},
+                                      title='Percentage of Predicted Labels')
+                        fig1.update_traces(textposition='inside', textinfo='percent+label')
+                        st.plotly_chart(fig1, use_container_width=True)
+                        if comet_initialized and experiment: # Check if experiment is initialized
+                            experiment.log_figure(figure=fig1, figure_name="label_pie_chart")
+                    with col2:
+                        st.subheader("Bar Chart", divider="rainbow")
+                        fig2 = px.bar(value_counts, x="count", y="entity_group", color="entity_group",
+                                      text_auto=True, title='Occurrences of Predicted Labels')
+                        st.plotly_chart(fig2, use_container_width=True)
+                        if comet_initialized and experiment: # Check if experiment is initialized
+                            experiment.log_figure(figure=fig2, figure_name="label_bar_chart")
+                else:
+                    st.warning("No entities were extracted from the provided text.")
+                dfa = pd.DataFrame(
+                    data={
+                        'word': ['entity extracted from your text data'],
+                        'score': ['accuracy score; how accurately a tag has been assigned to a given entity'],
+                        'entity_group': ['label (tag) assigned to a given extracted entity'],
+                        'start': ['index of the start of the corresponding entity'],
+                        'end': ['index of the end of the corresponding entity'],
+                    }
+                )
+                buf = io.BytesIO()
+                with zipfile.ZipFile(buf, "w") as myzip:
+                    if not df.empty:
+                        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
+                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
+                with stylable_container(
+                     key="download_button",
+                     css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
+                 ):
+                    st.download_button(
+                         label="Download zip file",
+                         data=buf.getvalue(),
+                         file_name="nlpblogs_ner_results.zip",
+                         mime="application/zip",)
+                st.divider()
+        else:
+            st.warning("No meaningful text found to process. Please enter a URL or text.")
+    except Exception as e:
+        st.error(f"An unexpected error occurred: {e}")
+    finally:
+        if comet_initialized and experiment:
+            experiment.end()
+st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")