Spaces:
Sleeping
Sleeping
from datetime import datetime | |
import re, trafilatura | |
from trafilatura.settings import DEFAULT_CONFIG | |
# trafilatura's DEFAULT_CONFIG is a configparser.ConfigParser and the library
# looks options up in its "DEFAULT" section, so a plain attribute assignment
# (DEFAULT_CONFIG.MAX_FILE_SIZE = 50000) is never consulted. Set the option
# through the mapping API instead; ConfigParser values must be strings.
# NOTE(review): with the limit actually in effect, downloads larger than
# 50000 bytes are presumably rejected by fetch_url -- confirm that 50000 is
# the intended ceiling.
DEFAULT_CONFIG["DEFAULT"]["MAX_FILE_SIZE"] = "50000"
# Compiled patterns for dates embedded in URLs, listed in the order they are
# tried. Each exposes the named groups "y", "m" and "d". The patterns only
# check the digit layout; whether the digits form a real calendar date is
# left to the code that consumes the match.
_URL_DATE_PATS = [
    re.compile(pattern)
    for pattern in (
        r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})",  # 2025-07-03
        r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})",  # 2025/07/03
        r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})",  # 20250703
    )
]
def _meta_date(url: str):
    """Download *url* and return the publication date from its metadata.

    The page is fetched with ``trafilatura.fetch_url`` and its metadata read
    with ``trafilatura.extract_metadata``. The metadata ``date`` value is
    parsed as ISO 8601; if the full value does not parse, only the part
    before the first ``"T"`` is tried.

    Returns a naive ``datetime``, or ``None`` when the page cannot be
    fetched, carries no date, or the date cannot be parsed.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    meta = trafilatura.extract_metadata(page)
    raw = getattr(meta, "date", None) if meta else None
    if not raw:
        return None

    text = str(raw)
    # Try the full value first, then fall back to the date-only prefix.
    for candidate in (text, text.split("T")[0]):
        try:
            parsed = datetime.fromisoformat(candidate)
        except ValueError:
            continue
        # A value with a UTC offset parses to an aware datetime, which cannot
        # be ordered against the naive start date built by the caller
        # (TypeError). Drop the offset and keep the wall-clock value.
        return parsed.replace(tzinfo=None)
    return None
def _regex_date(url: str):
    """Return the first valid calendar date embedded in *url*, or ``None``.

    The patterns in ``_URL_DATE_PATS`` are tried in order. For each pattern
    every successive (non-overlapping) match is examined, so a digit run that
    merely looks like a date -- e.g. an 8-digit article id whose "month" is
    56 -- does not hide a real date that appears later in the URL.

    Returns a naive ``datetime`` at midnight of the matched day.
    """
    for pat in _URL_DATE_PATS:
        for m in pat.finditer(url):
            try:
                return datetime(
                    int(m.group("y")), int(m.group("m")), int(m.group("d"))
                )
            except ValueError:
                # Right digit layout but not a real date; keep scanning.
                continue
    return None
def is_after_start(url: str, start_ymd: str) -> bool:
    """Tell whether the page at *url* was published on or after a start day.

    The publication date comes from ``_meta_date(url)``; if that yields
    nothing, ``_regex_date(url)`` is consulted. When neither source produces
    a date the URL is accepted and ``True`` is returned.

    - start_ymd: 'YYYYMMDD'
    """
    threshold = datetime.strptime(start_ymd, "%Y%m%d")
    # The URL regex is only evaluated when the metadata lookup returned None.
    published = _meta_date(url) or _regex_date(url)
    return True if published is None else published >= threshold