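"""Fetch web pages with trafilatura and turn the extracted text into
sentence-bounded lines using spaCy; also provides a page-metadata helper."""
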
import subprocess
import sys
from time import sleep

import spacy
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG


try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Model not installed yet: download it with the current interpreter, then load it.
    print("Downloading spaCy model 'en_core_web_lg' ...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_lg"], check=True)
    nlp = spacy.load("en_core_web_lg")


DEFAULT_CONFIG.MAX_FILE_SIZE = 50000  # limit how large a fetched file may be
MIN_CHAR = 50  # discard lines shorter than this
MAX_CHAR = 5000  # sentence-split at most the first 5,000 characters of a long line


def get_page(url):
    """Fetch a page with trafilatura, retrying up to three times with a short back-off."""
    page = None
    for _ in range(3):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            if page is not None:
                print("Fetched " + url, file=sys.stderr)
                break
        except Exception:
            pass
        sleep(3)
    return page


def url2lines(url):
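    """Fetch a URL and return its extracted text as a list of raw lines (empty list on failure)."""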
    page = get_page(url)

    if page is None:
        return []

    lines = html2lines(page)
    return lines


def line_correction(lines, max_size=100):
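    """Drop lines shorter than MIN_CHAR and split long lines into sentence-bounded
    chunks of roughly max_size characters (only the first MAX_CHAR characters are parsed)."""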
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) > max_size:
            doc = nlp(
                line[:MAX_CHAR]
            )  # We split lines into sentences, but for performance we take only the first 5k characters per line
            stack = ""
            for sent in doc.sents:
                if len(stack) > 0:
                    stack += " "
                stack += str(sent).strip()
                if len(stack) > max_size:
                    out_lines.append(stack)
                    stack = ""

            if (
                len(stack) > MIN_CHAR
            ):  # Ensure every line in out_lines satisfies the MIN_CHAR restriction
                out_lines.append(stack)
        else:
            out_lines.append(line)

    return out_lines


def html2lines(page):
    out_lines = []

    if page is None or len(page.strip()) == 0:
        return out_lines

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return out_lines

    return text.split(
        "\n"
    )  # We return the whole extracted page split on newlines; reformat later (e.g. with line_correction).


def html2metadata(url):
    """Fetch a URL and return its trafilatura metadata as a dict (empty dict on failure)."""
    page = get_page(url)

    if page is None:
        return {}

    metadata = trafilatura.extract_metadata(page)
    return metadata.as_dict() if metadata is not None else {}

if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    text = " ".join(url2lines(url))
    print(metadata)
    print(text)