File size: 1,437 Bytes
558c227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from datetime import datetime
import re, trafilatura
from trafilatura.settings import DEFAULT_CONFIG

# Sets a ``MAX_FILE_SIZE`` attribute on trafilatura's shared DEFAULT_CONFIG
# object, the same object handed to ``trafilatura.fetch_url`` in this module.
# NOTE(review): in current trafilatura releases DEFAULT_CONFIG appears to be a
# configparser.ConfigParser, whose options are read via get()/getint(); if so,
# this attribute assignment does not change the "MAX_FILE_SIZE" option, and the
# effective spelling would be
#     DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000")
# Confirm before changing: an active limit of 50000 (presumably bytes) would
# make downloads of larger pages fail.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000

# Date layouts looked for inside a URL, compiled once at import time and kept
# in priority order. Every pattern exposes the named groups ``y``, ``m``, ``d``.
_URL_DATE_PATS = [
    re.compile(layout)
    for layout in (
        r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})",  # 2025-07-03
        r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})",  # 2025/07/03
        r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})",  # 20250703
    )
]

def _meta_date(url: str) -> "datetime | None":
    """Return the publication date stored in the page's metadata, if any.

    Downloads ``url`` with trafilatura, reads the ``date`` field of the
    extracted metadata and parses it as an ISO-8601 string.

    Args:
        url: Address of the page to download.

    Returns:
        A naive ``datetime``, or ``None`` when the download yields nothing,
        the metadata carries no date, or the date cannot be parsed. Any UTC
        offset present in the metadata is dropped (wall-clock value kept) so
        the result can be compared with other naive datetimes.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    meta = trafilatura.extract_metadata(page)
    if not meta or not meta.date:
        return None

    # Coerce defensively: fromisoformat() raises TypeError on non-strings.
    raw = str(meta.date).strip()

    # Try the full timestamp first, then only the part before "T" for values
    # whose time/offset suffix fromisoformat() rejects.
    for candidate in (raw, raw.split("T", 1)[0]):
        try:
            parsed = datetime.fromisoformat(candidate)
        except ValueError:
            continue
        # An offset-aware value cannot be ordered against a naive one.
        return parsed.replace(tzinfo=None)
    return None

def _regex_date(url: str) -> "datetime | None":
    """Return the first valid calendar date embedded in ``url``.

    The patterns in ``_URL_DATE_PATS`` are tried in order. Within one pattern
    every match is examined, so a date-like run that is not a real date
    (e.g. ``1234-56-78``) does not hide a valid date later in the URL.

    Args:
        url: The URL text to scan.

    Returns:
        A naive ``datetime`` at midnight of the date found, or ``None`` when
        no match forms a date that ``datetime`` accepts.
    """
    for pat in _URL_DATE_PATS:
        for hit in pat.finditer(url):
            try:
                return datetime(
                    int(hit.group("y")), int(hit.group("m")), int(hit.group("d"))
                )
            except ValueError:
                # Digits matched the layout but are not a real date
                # (month 13, day 32, year 0, ...); keep looking.
                continue
    return None


def is_after_start(url: str, start_ymd: str) -> bool:
    """Tell whether the page at ``url`` was published on or after a start date.

    The publication date is taken from the page metadata (``_meta_date``)
    and, failing that, from a date embedded in the URL (``_regex_date``).

    Args:
        url: Address of the page to check.
        start_ymd: Inclusive lower bound formatted as ``'YYYYMMDD'``.

    Returns:
        ``True`` when the publication date is not earlier than ``start_ymd``,
        and also when no publication date could be determined; ``False``
        otherwise.

    Raises:
        ValueError: If ``start_ymd`` is not a valid ``'YYYYMMDD'`` date.
    """
    t0 = datetime.strptime(start_ymd, "%Y%m%d")

    pub_dt = _meta_date(url)
    if pub_dt is None:
        pub_dt = _regex_date(url)
    if pub_dt is None:
        # Unknown publication date: keep the URL rather than drop it.
        return True

    # t0 is naive; ordering it against an offset-aware datetime raises
    # TypeError, so drop any offset before comparing.
    if pub_dt.tzinfo is not None:
        pub_dt = pub_dt.replace(tzinfo=None)
    return pub_dt >= t0