Pledge_Tracker / system /html2lines.py
yulongchen's picture
add
35b3f62
import sys
from time import sleep
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
import spacy
from lxml.etree import tostring
import lxml.etree
import spacy
import subprocess
# Load the large English spaCy pipeline at import time; line_correction uses
# it to split over-long lines into sentences. If the model is not installed,
# download it once and retry the load.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("πŸ” Downloading spaCy model 'en_core_web_lg' ...")
    # Invoke the interpreter that is running this module rather than whatever
    # "python" resolves to on PATH, so the model is installed into the same
    # environment that is about to load it.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")
# NOTE(review): this assigns a plain attribute on trafilatura's DEFAULT_CONFIG
# object. Presumably it is meant to cap the size of downloaded files; confirm
# that trafilatura actually reads this attribute (as opposed to a config
# option) before relying on it.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
# Lines shorter than this many characters are dropped by line_correction.
MIN_CHAR = 50
# Only the first MAX_CHAR characters of an over-long line are handed to spaCy
# for sentence splitting (performance guard in line_correction).
MAX_CHAR = 5000
def get_page(url):
    """Download *url* with trafilatura, retrying on failure.

    Up to three attempts are made, pausing three seconds between attempts.
    A successful fetch is reported on stderr.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``trafilatura.fetch_url`` returned for the first successful
        attempt, or ``None`` if every attempt failed.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort download: treat any fetch error as a failed attempt.
            # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit
            # working while we are retrying.
            page = None
        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page
        # No point in waiting after the last attempt.
        if attempt < max_attempts - 1:
            sleep(3)
    return None
def url2lines(url):
    """Fetch *url* and return its extracted text as a list of lines.

    Returns an empty list when the page could not be downloaded.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
def line_correction(lines, max_size=100):
    """Normalise line lengths by dropping short lines and regrouping long ones.

    Args:
        lines: Iterable of text lines.
        max_size: Lines longer than this are re-split on sentence boundaries.

    Returns:
        A new list where
        * lines shorter than ``MIN_CHAR`` are discarded,
        * lines of at most ``max_size`` characters are kept unchanged,
        * longer lines are split into sentences with spaCy and the sentences
          are glued back together (space-separated) into chunks that are
          emitted as soon as they exceed ``max_size`` characters.
    """
    corrected = []
    for text in lines:
        n_chars = len(text)
        if n_chars < MIN_CHAR:
            continue
        if n_chars <= max_size:
            corrected.append(text)
            continue

        # Sentence-split the long line; for performance only the first
        # MAX_CHAR characters are parsed.
        parsed = nlp(text[:MAX_CHAR])
        buffer = ""
        for sentence in parsed.sents:
            piece = str(sentence).strip()
            buffer = f"{buffer} {piece}" if buffer else piece
            if len(buffer) > max_size:
                corrected.append(buffer)
                buffer = ""

        # Keep the trailing remainder only if it is longer than MIN_CHAR, so
        # that every emitted line respects the minimum-length restriction.
        if len(buffer) > MIN_CHAR:
            corrected.append(buffer)
    return corrected
def html2lines(page):
    """Extract the main text of an HTML document and split it into lines.

    Args:
        page: HTML content of the page, or ``None``.

    Returns:
        The text extracted by trafilatura, split on ``"\\n"``. An empty list
        is returned when *page* is ``None``, empty or whitespace-only, or
        when trafilatura extracts nothing.
    """
    # Check for None *before* calling .strip(); the reverse order raises
    # AttributeError on None instead of returning an empty result.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Clear trafilatura's internal caches after each extraction.
    reset_caches()

    if text is None:
        return []

    # The whole page is returned line by line; callers reformat it later
    # (see line_correction).
    return text.split("\n")
def html2metadata(url):
    """Download *url* and return the page metadata as a dictionary.

    Args:
        url: Address of the page to inspect.

    Returns:
        The result of ``as_dict()`` on the metadata object produced by
        ``trafilatura.extract_metadata``. An empty dict is returned when the
        page could not be downloaded or no metadata object was produced,
        instead of failing with an exception.
    """
    page = get_page(url)
    if page is None:
        # Download failed after all retries; nothing to extract from.
        return {}

    metadata = trafilatura.extract_metadata(page)
    if metadata is None:
        return {}
    return metadata.as_dict()
# Manual smoke test: fetch one article, run both the metadata and the text
# extraction paths, and print the metadata.
if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    # The original referenced an undefined name ``page`` here (NameError);
    # go through url2lines, which downloads the page and tolerates failures.
    text = " ".join(url2lines(url))
    print(metadata)