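"""Scraping helpers built on trafilatura and spaCy.

Fetches pages with retries, extracts the main text and metadata, and splits
overly long extracted lines into sentence-bounded chunks (see line_correction).
"""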
import subprocess
import sys
from time import sleep

import spacy
import trafilatura
import lxml.etree
from lxml.etree import tostring
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Downloading spaCy model 'en_core_web_lg' ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
    nlp = spacy.load("en_core_web_lg")
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000

MIN_CHAR = 50  # shortest line (in characters) worth keeping
MAX_CHAR = 5000  # only the first MAX_CHAR characters of a long line are sentence-split
def get_page(url):
    page = None
    for _ in range(3):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            assert page is not None
            print("Fetched " + url, file=sys.stderr)
            break
        except Exception:
            sleep(3)
    return page
def url2lines(url):
    page = get_page(url)
    if page is None:
        return []
    lines = html2lines(page)
    return lines
def line_correction(lines, max_size=100):
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) > max_size:
            doc = nlp(
                line[:MAX_CHAR]
            )  # We split lines into sentences, but for performance we only take the first MAX_CHAR characters per line
            stack = ""
            for sent in doc.sents:
                if len(stack) > 0:
                    stack += " "
                stack += str(sent).strip()
                if len(stack) > max_size:
                    out_lines.append(stack)
                    stack = ""

            if (
                len(stack) > MIN_CHAR
            ):  # Ensure every line in out_lines satisfies the MIN_CHAR restriction
                out_lines.append(stack)
        else:
            out_lines.append(line)

    return out_lines
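# Note on line_correction: lines shorter than MIN_CHAR are dropped, lines of at most
# max_size characters are kept as-is, and longer lines are re-split on spaCy sentence
# boundaries into chunks of roughly max_size characters (only the first MAX_CHAR
# characters of each long line are considered).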
def html2lines(page):
    out_lines = []
    if page is None or len(page.strip()) == 0:
        return out_lines
    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()
    if text is None:
        return out_lines

    return text.split(
        "\n"
    )  # We just return the entire extracted page split on newlines; callers reformat later.
def html2metadata(url):
    page = get_page(url)
    if page is None:
        return {}
    metadata = trafilatura.extract_metadata(page)
    return metadata.as_dict()
if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    lines = url2lines(url)
    text = " ".join(lines)
    print(metadata)
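    # Illustrative extension (not in the original script): clean the extracted lines
    # with line_correction and print a short preview of the article body.
    cleaned = line_correction(lines)
    print("\n".join(cleaned[:5]))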