Pledge_Tracker / system /date_verifier.py
yulongchen's picture
Add system
558c227
from datetime import datetime
import re, trafilatura
from trafilatura.settings import DEFAULT_CONFIG
# Intended to cap the download size used when fetching pages; this mutates the
# DEFAULT_CONFIG object imported from trafilatura.settings, so any other code
# holding a reference to that same object sees the change as well.
# NOTE(review): this sets a plain Python attribute. If DEFAULT_CONFIG is a
# ConfigParser (as trafilatura's docs suggest), the MAX_FILE_SIZE *option* may
# be left unchanged — confirm whether
# DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "50000") was intended.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
# Compiled regexes that locate a year/month/day triple inside a URL.
# Order matters: the delimited forms are listed before the bare 8-digit form.
# Each pattern exposes the named groups "y", "m" and "d".
_URL_DATE_PATS = [
    re.compile(source)
    for source in (
        r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})",  # 2025-07-03
        r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})",  # 2025/07/03
        r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})",  # 20250703
    )
]
def _meta_date(url: str):
    """Return the publication date found in the page's metadata, if any.

    The page is fetched with ``trafilatura.fetch_url`` (using
    ``DEFAULT_CONFIG``) and its metadata is read with
    ``trafilatura.extract_metadata``. The metadata's ``date`` string is parsed
    with ``datetime.fromisoformat``; if the full string is rejected, only the
    part before the first ``"T"`` is parsed instead.

    Args:
        url: Address of the page to inspect.

    Returns:
        The parsed ``datetime``, or ``None`` when the fetch yields nothing, the
        metadata or its date is missing, or the date cannot be parsed.
    """
    html = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    metadata = trafilatura.extract_metadata(html) if html else None
    raw_date = metadata.date if metadata else None
    if not raw_date:
        return None

    # First attempt: the complete string as delivered by the extractor.
    try:
        return datetime.fromisoformat(raw_date)
    except ValueError:
        pass

    # Second attempt: keep only the calendar-date portion before any "T".
    try:
        return datetime.fromisoformat(raw_date.split("T")[0])
    except Exception:
        return None
def _regex_date(url: str):
    """Extract a calendar date embedded in the text of the URL itself.

    The patterns in ``_URL_DATE_PATS`` are tried in order. Within each pattern
    every non-overlapping match is examined from left to right, and the first
    one that forms a real calendar date is returned.

    Args:
        url: URL to scan for a date.

    Returns:
        A naive ``datetime`` at midnight of the matched date, or ``None`` when
        no match forms a valid date.
    """
    for pat in _URL_DATE_PATS:
        # Look at every match, not only the first: an impossible date-like
        # token (e.g. the id "12345678") must not hide a valid date that
        # appears later in the same URL.
        for m in pat.finditer(url):
            try:
                return datetime(
                    int(m.group("y")), int(m.group("m")), int(m.group("d"))
                )
            except ValueError:
                # The digits matched but are not a real date (month 13, ...).
                continue
    return None
def is_after_start(url: str, start_ymd: str) -> bool:
    """Report whether the page at *url* was published on or after a start date.

    The publication date is taken from the page metadata via ``_meta_date``
    and, if that yields nothing, from a date embedded in the URL via
    ``_regex_date``.

    Args:
        url: Address of the page to check.
        start_ymd: Inclusive lower bound, formatted as ``'YYYYMMDD'``.

    Returns:
        ``True`` when the publication date is on or after ``start_ymd``, and
        also when no publication date can be determined (undatable pages are
        kept rather than dropped). ``False`` otherwise.

    Raises:
        ValueError: If ``start_ymd`` does not match the ``'%Y%m%d'`` format.
    """
    t0 = datetime.strptime(start_ymd, "%Y%m%d")
    pub_dt = _meta_date(url)
    if pub_dt is None:
        pub_dt = _regex_date(url)
    if pub_dt is None:
        return True
    # A metadata timestamp carrying a UTC offset parses to an aware datetime,
    # and comparing that with the naive ``t0`` raises TypeError. Drop the
    # offset and compare on the publisher's wall-clock time instead.
    if pub_dt.tzinfo is not None:
        pub_dt = pub_dt.replace(tzinfo=None)
    return pub_dt >= t0