"""Decide whether a web page is dated on or after a given start date.

The date is taken from the metadata trafilatura extracts from the downloaded
page and, failing that, from a date-looking fragment of the URL itself.
"""

from datetime import datetime
import re
from typing import Optional

import trafilatura
from trafilatura.settings import DEFAULT_CONFIG

# NOTE(review): this sets a plain Python attribute on DEFAULT_CONFIG. If
# DEFAULT_CONFIG is a ConfigParser (as trafilatura's settings docs suggest),
# the option trafilatura reads is presumably
# DEFAULT_CONFIG["DEFAULT"]["MAX_FILE_SIZE"], so this line may have no effect.
# Left unchanged on purpose: making the 50000 limit effective would change
# which pages get downloaded -- confirm the intent before switching.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000

# Tried in order; each pattern exposes the named groups "y", "m" and "d".
_URL_DATE_PATS = [
    re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"),  # 2025-07-03
    re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"),  # 2025/07/03
    re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"),  # 20250703
]


def _meta_date(url: str) -> Optional[datetime]:
    """Return the date found in the metadata of the page at *url*.

    The page is downloaded with ``trafilatura.fetch_url`` and its metadata
    extracted with ``trafilatura.extract_metadata``.

    Returns:
        A naive ``datetime``, or ``None`` when the download yields nothing,
        no metadata date is present, or the date string cannot be parsed.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    meta = trafilatura.extract_metadata(page)
    if not meta or not meta.date:
        return None

    try:
        parsed = datetime.fromisoformat(meta.date)
    except ValueError:
        # Retry with only the part before "T" in case the time portion is
        # what fromisoformat() rejected.
        try:
            parsed = datetime.fromisoformat(meta.date.split("T")[0])
        except ValueError:
            return None

    # Callers compare against a naive datetime; an aware value (input with a
    # UTC offset) would make that comparison raise TypeError, so drop tzinfo
    # and keep the wall-clock value as written.
    return parsed.replace(tzinfo=None)


def _regex_date(url: str) -> Optional[datetime]:
    """Return a date parsed from the text of *url*, or ``None``.

    Each pattern in ``_URL_DATE_PATS`` is tried in order, using its first
    match only. A match that is not a valid calendar date (e.g. month 13) is
    skipped and the next pattern is tried.
    """
    for pat in _URL_DATE_PATS:
        m = pat.search(url)
        if m is None:
            continue
        try:
            return datetime(
                int(m.group("y")), int(m.group("m")), int(m.group("d"))
            )
        except ValueError:
            # Digits matched but do not form a real date; try the next pattern.
            continue
    return None


def is_after_start(url: str, start_ymd: str) -> bool:
    """Return whether the page at *url* is dated on or after *start_ymd*.

    Args:
        url: Address of the page to check.
        start_ymd: Start date formatted as 'YYYYMMDD'.

    Returns:
        ``True`` when the detected date is >= the start date, and also when
        no date can be determined from either the page metadata or the URL.

    Raises:
        ValueError: If *start_ymd* does not match the '%Y%m%d' format.
    """
    t0 = datetime.strptime(start_ymd, "%Y%m%d")

    pub_dt = _meta_date(url)
    if pub_dt is None:
        pub_dt = _regex_date(url)

    # Unknown date: keep the URL rather than discard it.
    if pub_dt is None:
        return True
    return pub_dt >= t0