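"""Fetch web pages with trafilatura and turn the extracted text into
sentence-bounded lines using spaCy; also provides a page-metadata helper."""
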
import subprocess
import sys
from time import sleep

import spacy
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG


try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Model not installed yet: download it with the current interpreter, then load it.
    print("Downloading spaCy model 'en_core_web_lg' ...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_lg"], check=True)
    nlp = spacy.load("en_core_web_lg")


DEFAULT_CONFIG.MAX_FILE_SIZE = 50000  # limit how large a fetched file may be
MIN_CHAR = 50  # discard lines shorter than this
MAX_CHAR = 5000  # sentence-split at most the first 5,000 characters of a long line


def get_page(url):
    """Fetch a page with trafilatura, retrying up to three times with a short back-off."""
    page = None
    for _ in range(3):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            if page is not None:
                print("Fetched " + url, file=sys.stderr)
                break
        except Exception:
            pass
        sleep(3)
    return page


def url2lines(url):
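    """Fetch a URL and return its extracted text as a list of raw lines (empty list on failure)."""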
    page = get_page(url)

    if page is None:
        return []

    lines = html2lines(page)
    return lines


def line_correction(lines, max_size=100):
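    """Drop lines shorter than MIN_CHAR and split long lines into sentence-bounded
    chunks of roughly max_size characters (only the first MAX_CHAR characters are parsed)."""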
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) > max_size:
            doc = nlp(
                line[:MAX_CHAR]
            )  # We split lines into sentences, but for performance we take only the first 5k characters per line
            stack = ""
            for sent in doc.sents:
                if len(stack) > 0:
                    stack += " "
                stack += str(sent).strip()
                if len(stack) > max_size:
                    out_lines.append(stack)
                    stack = ""

            if (
                len(stack) > MIN_CHAR
            ):  # Ensure every line in out_lines satisfies the MIN_CHAR restriction
                out_lines.append(stack)
        else:
            out_lines.append(line)

    return out_lines


def html2lines(page):
    out_lines = []

    if page is None or len(page.strip()) == 0:
        return out_lines

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return out_lines

    return text.split(
        "\n"
    )  # We return the whole extracted page split on newlines; reformat later (e.g. with line_correction).


def html2metadata(url):
    """Fetch a URL and return its trafilatura metadata as a dict (empty dict on failure)."""
    page = get_page(url)

    if page is None:
        return {}

    metadata = trafilatura.extract_metadata(page)
    return metadata.as_dict() if metadata is not None else {}

if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    text = " ".join(url2lines(url))
    print(metadata)
    print(text)