Spaces:
Sleeping
Sleeping
File size: 1,437 Bytes
558c227 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
from datetime import datetime
import re, trafilatura
from trafilatura.settings import DEFAULT_CONFIG
# Intent appears to be capping the size of downloaded pages at 50000
# (presumably bytes -- TODO confirm the unit against the trafilatura docs).
# NOTE(review): trafilatura documents its settings object as a configparser
# instance; if that holds here, assigning a Python attribute does not change
# the "MAX_FILE_SIZE" option the library reads. Confirm, and if the limit is
# really wanted consider setting the option on a copy of the config instead.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
# Compiled patterns for dates embedded in a URL, listed in the order they are
# tried. Each one exposes the named groups ``y`` (4 digits), ``m`` (2 digits)
# and ``d`` (2 digits).
_URL_DATE_PATS = [
    re.compile(source)
    for source in (
        r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})",  # 2025-07-03
        r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})",  # 2025/07/03
        r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})",  # 20250703
    )
]
def _meta_date(url: str):
    """Return the publication date recorded in the metadata of the page at *url*.

    The page is downloaded with ``trafilatura.fetch_url`` and its metadata is
    read with ``trafilatura.extract_metadata``.

    Args:
        url: Address of the page to inspect.

    Returns:
        A naive ``datetime`` parsed from the metadata date, or ``None`` when
        the download yields nothing, the metadata has no date, or the date
        string cannot be parsed by ``datetime.fromisoformat``.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    meta = trafilatura.extract_metadata(page)
    if not meta or not meta.date:
        return None

    raw = meta.date
    try:
        parsed = datetime.fromisoformat(raw)
    except ValueError:
        # The full string was rejected; retry with only the part before "T"
        # so that an unparseable time component does not discard the date.
        try:
            parsed = datetime.fromisoformat(raw.split("T")[0])
        except ValueError:
            return None

    # fromisoformat() returns a timezone-aware value when the string carries a
    # UTC offset. The other datetimes in this module (built with datetime(...)
    # and datetime.strptime) are naive, and comparing aware with naive raises
    # TypeError, so the offset is dropped and the wall-clock date/time kept.
    return parsed.replace(tzinfo=None)
def _regex_date(url: str):
    """Return the first valid calendar date embedded in *url*.

    The patterns in ``_URL_DATE_PATS`` are tried in order. Within one pattern
    every match is examined from left to right, so a digit run that is not a
    real date (for example an 8-digit article ID) does not hide a genuine
    date that appears later in the URL.

    Args:
        url: The URL text to scan.

    Returns:
        A naive ``datetime`` at midnight of the matched day, or ``None`` when
        no pattern yields a valid calendar date.
    """
    for pat in _URL_DATE_PATS:
        # finditer() instead of search(): keep looking past matches whose
        # year/month/day values do not form a real date.
        for m in pat.finditer(url):
            try:
                return datetime(int(m["y"]), int(m["m"]), int(m["d"]))
            except ValueError:
                # e.g. month 13 or day 40 -- not a date, try the next match.
                continue
    return None
def is_after_start(url: str, start_ymd: str) -> bool:
    """Tell whether the page at *url* was published on or after a start date.

    The publication date comes from ``_meta_date(url)``; when that returns
    ``None`` the result of ``_regex_date(url)`` is used instead.

    Args:
        url: Address of the page to check.
        start_ymd: Start date formatted as 'YYYYMMDD'.

    Returns:
        ``True`` when the publication date is not earlier than the start
        date, and also when no publication date could be determined;
        ``False`` otherwise.

    Raises:
        ValueError: If *start_ymd* does not match the ``%Y%m%d`` format.
    """
    # Parsed first, so a malformed start date fails before any page lookup.
    start = datetime.strptime(start_ymd, "%Y%m%d")

    published = _meta_date(url)
    if published is None:
        published = _regex_date(url)

    # Pages without a detectable date are accepted rather than rejected.
    return True if published is None else published >= start
|