Spaces:

fracapuano
/

AISandbox

Runtime error

App Files Files Community

fracapuano committed on Aug 31, 2023

Commit

a134869

1 Parent(s): a35034f

fix: summarization pipeline restructuring

Browse files

Files changed (1) hide show

summarization/summarization.py +88 -27

summarization/summarization.py CHANGED Viewed

@@ -1,43 +1,104 @@
 import streamlit as st
 from transformers import pipeline
 @st.cache_resource
-def summarization_model():
-    """Build the summarization pipeline (pre-commit version of this function).
-
-    The checkpoint name is hard-coded and used for both the model and the
-    tokenizer. The function is decorated with `st.cache_resource`.
-    """
-    model_name = "google/pegasus-xsum"
     summarizer = pipeline(
         model=model_name,
-        tokenizer=model_name,
         task="summarization"
     )
     return summarizer
 def summarization_main():
-    """Render the summarization page (pre-commit version of this function).
-
-    The user either types a text or uploads a .txt file; pressing
-    "Get summary" loads the model and shows the summary of the text.
-    """
-    st.markdown("<h2 style='text-align: center; color:grey;'>Text Summarization</h2>", unsafe_allow_html=True)
-    st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is text summarization about?<b></h3>", unsafe_allow_html=True)
-    st.write("Text summarization is producing a shorter version of a given text while preserving its important information.")
-    st.markdown('___')
-    # the two input modes are selected through a radio button
-    source = st.radio("How would you like to start? Choose an option below", ["I want to input some text", "I want to upload a file"])
-    if source == "I want to input some text":
         sample_text = ""
-        text = st.text_area("Input a text in English (10,000 characters max) or use the example below", value=sample_text, max_chars=10000, height=330)
-        button = st.button("Get summary")
-        if button:
-            with st.spinner(text="Loading summarization model..."):
-                summarizer = summarization_model()
-            with st.spinner(text="Summarizing text..."):
-                # the whole text is passed to the model in a single call
-                # NOTE(review): no chunking here - long inputs may exceed the model input limit, confirm
-                summary = summarizer(text, max_length=130, min_length=30)
-                st.text(summary[0]["summary_text"])
-    elif source == "I want to upload a file":
-        uploaded_file = st.file_uploader("Choose a .txt file to upload", type=["txt"])
         if uploaded_file is not None:
-            # the uploaded bytes are decoded as UTF-8 and shown in an editable text area
-            raw_text = str(uploaded_file.read(),"utf-8")
-            text = st.text_area("", value=raw_text, height=330)
-            button = st.button("Get summary")
             if button:
-                with st.spinner(text="Loading summarization model..."):
-                    summarizer = summarization_model()
                 with st.spinner(text="Summarizing text..."):
-                    # same single-call summarization as in the text-input branch
-                    summary = summarizer(text, max_length=130, min_length=30)
-                    st.text(summary[0]["summary_text"])

 import streamlit as st
 from transformers import pipeline
+from qa.qa import file_to_doc
+from transformers import AutoTokenizer
+from typing import Text, Union
 @st.cache_resource
+def summarization_model(
+    model_name:str="facebook/bart-large-cnn",
+    custom_tokenizer:Union[AutoTokenizer, bool]=False
+    ):
     summarizer = pipeline(
         model=model_name,
+        tokenizer=model_name if custom_tokenizer==False else custom_tokenizer,
         task="summarization"
     )
     return summarizer
+@st.cache_data
+def split_string_into_token_chunks(s:Text, _tokenizer:AutoTokenizer, chunk_size:int):
+    # Tokenize the entire string
+    token_ids = _tokenizer.encode(s)
+    # Split the token ids into chunks of the desired size
+    chunks = [token_ids[i:i+chunk_size] for i in range(0, len(token_ids), chunk_size)]
+    # Decode each chunk back into a string
+    return [_tokenizer.decode(chunk) for chunk in chunks]
 def summarization_main():
+    st.markdown("<h2 style='text-align: center'>Text Summarization</h2>", unsafe_allow_html=True)
+    st.markdown("<h3 style='text-align: left'><b>What is text summarization about?<b></h3>", unsafe_allow_html=True)
+    st.write("""
+        Text summarization is common NLP task concerned with producing a shorter version of a given text while preserving the important information
+        contained in such text
+        """)
+    OPTION_1 = "I want to input some text"
+    OPTION_2 = "I want to upload a file"
+    # option = st.radio("How would you like to start? Choose an option below", [OPTION_1, OPTION_2])
+    option = OPTION_2
+    # greenlight to summarize
+    text_is_given = False
+    if option == OPTION_1:
         sample_text = ""
+        text = st.text_area(
+            "Input a text in English (10,000 characters max)",
+            value=sample_text,
+            max_chars=10_000,
+            height=330)
+        # toggle text is given greenlight
+        text_is_given = not text_is_given
+    elif option == OPTION_2:
+        uploaded_file = st.file_uploader(
+                "Upload a pdf, docx, or txt file (scanned documents not supported)",
+                type=["pdf", "docx", "txt"],
+                help="Scanned documents are not supported yet 🥲"
+            )
         if uploaded_file is not None:
+            # parse the file using custom parsers and build a concatenation for the summarizer
+            text = " ".join(file_to_doc(uploaded_file))
+            # toggle text is given greenlight
+            text_is_given = not text_is_given
+    if text_is_given:
+        # minimal number of words in the summary
+        min_length, max_length = 30, 200
+        user_max_length = max_length
+        # user_max_lenght = st.slider(
+        #     label="Maximal number of tokens in the summary",
+        #     min_value=min_length,
+        #     max_value=max_length,
+        #     value=150,
+        #     step=10,
+        # )
+        summarizer_downloaded = False
+        # loading the tokenizer to split the input document into feasible chunks
+        model_name = "facebook/bart-large-cnn"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # the maximum number of tokens the model can handle depends on the model - accounting for tokens added by tokenizer
+        chunk_size = int(0.88*tokenizer.model_max_length)
+        # loading the summarization model considered
+        with st.spinner(text="Loading summarization model..."):
+            summarizer = summarization_model(model_name=model_name)
+            summarizer_downloaded = True
+        if summarizer_downloaded:
+            button = st.button("Summarize!")
             if button:
                 with st.spinner(text="Summarizing text..."):
+                    # summarizing each chunk of the input text to avoid exceeding the maximum number of tokens
+                    summary = ""
+                    chunks = split_string_into_token_chunks(text, tokenizer, chunk_size)
+                    for chunk in chunks:
+                        print(len(tokenizer.encode(chunk)))
+                        chunk_summary = summarizer(chunk, max_length=user_max_length, min_length=min_length)
+                        summary += chunk_summary[0]["summary_text"]
+                    st.markdown("<h3 style='text-align: left'><b>Summary<b></h3>", unsafe_allow_html=True)
+                    st.markdown(summary)