Spaces:
Runtime error
Runtime error
🚸 kw based file naming
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
utils.py
CHANGED
|
@@ -9,6 +9,12 @@ from pathlib import Path
|
|
| 9 |
|
| 10 |
import torch
|
| 11 |
from natsort import natsorted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def validate_pytorch2(torch_version: str = None):
|
|
@@ -88,6 +94,57 @@ def load_example_filenames(example_path: str or Path):
|
|
| 88 |
return examples
|
| 89 |
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def saves_summary(
|
| 92 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
| 93 |
):
|
|
@@ -99,16 +156,18 @@ def saves_summary(
|
|
| 99 |
add_signature: whether to add a signature to the output file
|
| 100 |
kwargs: additional keyword arguments to include in the output file
|
| 101 |
"""
|
| 102 |
-
outpath = (
|
| 103 |
-
Path.cwd() / f"document_summary_{get_timestamp()}.txt"
|
| 104 |
-
if outpath is None
|
| 105 |
-
else Path(outpath)
|
| 106 |
-
)
|
| 107 |
sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
|
| 108 |
sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
|
| 109 |
scores_text = "\n".join(sum_scores)
|
| 110 |
full_summary = "\n".join(sum_text)
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
with open(
|
| 113 |
outpath,
|
| 114 |
"w",
|
|
|
|
| 9 |
|
| 10 |
import torch
|
| 11 |
from natsort import natsorted
|
| 12 |
+
from typing import List
|
| 13 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 14 |
+
from itertools import combinations
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from rapidfuzz import fuzz
|
| 17 |
+
from nltk.corpus import stopwords
|
| 18 |
|
| 19 |
|
| 20 |
def validate_pytorch2(torch_version: str = None):
|
|
|
|
| 94 |
return examples
|
| 95 |
|
| 96 |
|
| 97 |
+
def extract_keywords(text: str, num_keywords: int = 3) -> List[str]:
    """
    Extracts keywords from a text using a TextRank-style ranking of word co-occurrences.

    The text is lowercased and stripped of English stopwords, split into
    sentences and word tokens, and every pair of words sharing a sentence is
    linked in a weighted co-occurrence graph. Words are then scored with ten
    passes of a PageRank-style update (damping factor 0.85), and the
    highest-scoring words that are not near-duplicates of an already chosen
    keyword are returned.

    Args:
        text: The text to extract keywords from.
        num_keywords: The maximum number of keywords to return. Default is 3.
            Values of zero or less yield an empty list.

    Returns:
        A list of at most ``num_keywords`` lowercase keyword strings, ordered
        from highest to lowest score. The list is shorter (possibly empty)
        when the text does not contain enough distinct co-occurring words.

    Note:
        Relies on NLTK's stopword corpus and sentence tokenizer data being
        installed; NLTK raises ``LookupError`` when a resource is missing.
    """
    # Nothing to extract; also prevents a negative value from slicing from the end.
    if num_keywords <= 0:
        return []

    # Remove stopwords from the input text
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.lower().split() if word not in stop_words)

    # Tokenize the text into sentences and words, keeping only words that are
    # at least 3 characters long and contain a letter or digit (the tokenizer
    # emits punctuation such as "..." as standalone tokens).
    sentences = [
        [
            word
            for word in word_tokenize(sentence)
            if len(word) >= 3 and any(char.isalnum() for char in word)
        ]
        for sentence in sent_tokenize(text)
    ]

    # Create a graph of word co-occurrences (symmetric edge weights)
    cooccur = defaultdict(lambda: defaultdict(int))
    for sentence in sentences:
        for w1, w2 in combinations(sentence, 2):
            cooccur[w1][w2] += 1
            cooccur[w2][w1] += 1

    # Total edge weight per word never changes between passes, so compute it once.
    totals = {word: sum(neighbours.values()) for word, neighbours in cooccur.items()}

    # Assign scores to words using the TextRank algorithm; scores are updated
    # in place, so later words in a pass already see the refreshed values.
    scores = defaultdict(float)
    for _ in range(10):
        for word, neighbours in cooccur.items():
            scores[word] = 0.15 + 0.85 * sum(
                weight / totals[other] * scores[other]
                for other, weight in neighbours.items()
            )

    # Walk the words from highest to lowest score, using fuzzy matching to skip
    # near-duplicates, and stop once enough keywords are collected. Filtering
    # before truncating ensures a skipped duplicate does not shrink the result.
    final_keywords = []
    for keyword in sorted(scores, key=scores.get, reverse=True):
        if any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
            continue
        final_keywords.append(keyword)
        if len(final_keywords) == num_keywords:
            break

    return final_keywords
|
| 146 |
+
|
| 147 |
+
|
| 148 |
def saves_summary(
|
| 149 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
| 150 |
):
|
|
|
|
| 156 |
add_signature: whether to add a signature to the output file
|
| 157 |
kwargs: additional keyword arguments to include in the output file
|
| 158 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
|
| 160 |
sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
|
| 161 |
scores_text = "\n".join(sum_scores)
|
| 162 |
full_summary = "\n".join(sum_text)
|
| 163 |
|
| 164 |
+
keywords = "_".join(extract_keywords(full_summary))
|
| 165 |
+
outpath = (
|
| 166 |
+
Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
|
| 167 |
+
if outpath is None
|
| 168 |
+
else Path(outpath)
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
with open(
|
| 172 |
outpath,
|
| 173 |
"w",
|