document-summarization

Runtime error

App Files Files Community

pszemraj committed on Apr 30, 2023

Commit

471b053

1 Parent(s): e219aa1

🚸 kw based file naming

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

utils.py +64 -5

utils.py CHANGED Viewed

@@ -9,6 +9,12 @@ from pathlib import Path
 import torch
 from natsort import natsorted
 def validate_pytorch2(torch_version: str = None):
@@ -88,6 +94,57 @@ def load_example_filenames(example_path: str or Path):
     return examples
 def saves_summary(
     summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
 ):
@@ -99,16 +156,18 @@ def saves_summary(
     add_signature: whether to add a signature to the output file
     kwargs: additional keyword arguments to include in the output file
     """
-    outpath = (
-        Path.cwd() / f"document_summary_{get_timestamp()}.txt"
-        if outpath is None
-        else Path(outpath)
-    )
     sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
     scores_text = "\n".join(sum_scores)
     full_summary = "\n".join(sum_text)
     with open(
         outpath,
         "w",

 import torch
 from natsort import natsorted
+from typing import List
+from nltk.tokenize import sent_tokenize, word_tokenize
+from itertools import combinations
+from collections import defaultdict
+from rapidfuzz import fuzz
+from nltk.corpus import stopwords
 def validate_pytorch2(torch_version: str = None):
     return examples
+def extract_keywords(text: str, num_keywords: int = 3) -> List[str]:
+    """
+    Extracts keywords from a text using a TextRank-style scoring of word co-occurrences.
+    The text is lowercased, stopwords and tokens shorter than 3 characters are
+    dropped, and the remaining words are scored over a graph linking words that
+    appear in the same sentence. Near-duplicate keywords are removed after the
+    top num_keywords candidates are selected, so fewer than num_keywords items
+    may be returned.
+    Args:
+        text: The text to extract keywords from.
+        num_keywords: The maximum number of keywords to extract. Default is 3.
+    Returns:
+        A list of lowercase strings, highest score first, containing at most num_keywords keywords.
+    """
+    # Remove stopwords from the input text. Matching is done on whitespace-split,
+    # lowercased words, so a stopword with punctuation attached (e.g. "the,") is kept.
+    # NOTE(review): presumably requires the NLTK stopwords/tokenizer data to be
+    # available at runtime - confirm against the app's setup.
+    stop_words = set(stopwords.words("english"))
+    text = " ".join([word for word in text.lower().split() if word not in stop_words])
+    # Tokenize the text into sentences and words
+    sentences = sent_tokenize(text)
+    words = [word_tokenize(sentence) for sentence in sentences]
+    # Filter out words that are shorter than 3 characters
+    words = [[word for word in sentence if len(word) >= 3] for sentence in words]
+    # Create a graph of word co-occurrences: every pair of tokens within one
+    # sentence adds a weight of 1 in both directions. A token repeated inside a
+    # sentence is paired with itself as well, which produces a self-edge.
+    cooccur = defaultdict(lambda: defaultdict(int))
+    for sentence in words:
+        for w1, w2 in combinations(sentence, 2):
+            cooccur[w1][w2] += 1
+            cooccur[w2][w1] += 1
+    # Assign scores to words using the TextRank algorithm: 10 fixed passes with no
+    # convergence check. Scores are updated in place, so words visited later in a
+    # pass already see the values written earlier in that same pass.
+    scores = defaultdict(float)
+    for i in range(10):
+        for word in cooccur:
+            # 0.15 base score plus 0.85 times the neighbours' scores, each weighted
+            # by the share of that neighbour's total edge weight shared with `word`.
+            score = 0.15 + 0.85 * sum(
+                cooccur[word][other] / sum(cooccur[other].values()) * scores[other]
+                for other in cooccur[word]
+            )
+            scores[word] = score
+    # Sort the words by score and return the top num_keywords keywords
+    keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
+    # Use fuzzy matching to remove similar keywords: a candidate is kept only if
+    # its fuzz.ratio against every already-kept keyword is 70 or lower. Dropped
+    # candidates are not replaced by lower-ranked words.
+    final_keywords = []
+    for keyword in keywords:
+        if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
+            final_keywords.append(keyword)
+    return final_keywords
 def saves_summary(
     summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
 ):
     add_signature: whether to add a signature to the output file
     kwargs: additional keyword arguments to include in the output file
     """
     sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
     scores_text = "\n".join(sum_scores)
     full_summary = "\n".join(sum_text)
+    keywords = "_".join(extract_keywords(full_summary))
+    outpath = (
+        Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
+        if outpath is None
+        else Path(outpath)
+    )
     with open(
         outpath,
         "w",