Spaces:
Sleeping
Sleeping
Commit
·
558c227
1
Parent(s):
273bc17
Add system
Browse files
- system/augmented_searching.py +4 -3
- system/date_verifier.py +58 -0
system/augmented_searching.py
CHANGED
@@ -6,10 +6,11 @@ import pandas as pd
|
|
6 |
from datetime import datetime
|
7 |
from pathlib import Path
|
8 |
import spacy
|
|
|
9 |
|
10 |
def google_search(query, api_key, search_engine_id, start_date, end_date):
|
11 |
# print(f"[SYSTEM] Calling Google Search API for: {query}")
|
12 |
-
sort = f"date:r:{start_date}:{end_date}"
|
13 |
url = "https://www.googleapis.com/customsearch/v1"
|
14 |
params = {
|
15 |
"q": query,
|
@@ -80,7 +81,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
|
|
80 |
|
81 |
results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
|
82 |
for result in results:
|
83 |
-
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
|
84 |
string_values.append("claim")
|
85 |
urls.append(result["link"])
|
86 |
queries.append(f"{pledge_author}: {claim_text}")
|
@@ -88,7 +89,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
|
|
88 |
for question in questions:
|
89 |
results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
|
90 |
for result in results:
|
91 |
-
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
|
92 |
string_values.append("question")
|
93 |
urls.append(result["link"])
|
94 |
queries.append(f"{question}")
|
|
|
6 |
from datetime import datetime
|
7 |
from pathlib import Path
|
8 |
import spacy
|
9 |
+
from date_verifier import is_after_start
|
10 |
|
11 |
def google_search(query, api_key, search_engine_id, start_date, end_date):
|
12 |
# print(f"[SYSTEM] Calling Google Search API for: {query}")
|
13 |
+
sort = f"date:r:{start_date}:{end_date}" #20241230:20250130
|
14 |
url = "https://www.googleapis.com/customsearch/v1"
|
15 |
params = {
|
16 |
"q": query,
|
|
|
81 |
|
82 |
results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
|
83 |
for result in results:
|
84 |
+
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
|
85 |
string_values.append("claim")
|
86 |
urls.append(result["link"])
|
87 |
queries.append(f"{pledge_author}: {claim_text}")
|
|
|
89 |
for question in questions:
|
90 |
results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
|
91 |
for result in results:
|
92 |
+
if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
|
93 |
string_values.append("question")
|
94 |
urls.append(result["link"])
|
95 |
queries.append(f"{question}")
|
system/date_verifier.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
import re, trafilatura
|
3 |
+
from trafilatura.settings import DEFAULT_CONFIG
|
4 |
+
|
5 |
+
# NOTE(review): presumably caps trafilatura downloads at 50 kB so metadata
# extraction stays fast on large pages — confirm against trafilatura docs.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
|
6 |
+
|
7 |
+
_URL_DATE_PATS = [
|
8 |
+
re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"), # 2025-07-03
|
9 |
+
re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"), # 2025/07/03
|
10 |
+
re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"), # 20250703
|
11 |
+
]
|
12 |
+
|
13 |
+
def _meta_date(url: str):
    """Fetch *url* and return its publication date from page metadata.

    Uses trafilatura to download the page and read the ``date`` metadata
    field. Returns a ``datetime``, or ``None`` when the page cannot be
    fetched, carries no date, or the date string cannot be parsed.
    """
    html = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not html:
        return None

    metadata = trafilatura.extract_metadata(html)
    if not metadata or not metadata.date:
        return None

    # Try the full ISO string first; on failure retry with only the date
    # part (everything before the "T" time separator).
    try:
        return datetime.fromisoformat(metadata.date)
    except ValueError:
        pass
    try:
        return datetime.fromisoformat(metadata.date.split("T")[0])
    except Exception:
        return None
|
29 |
+
|
30 |
+
def _regex_date(url: str):
    """Fallback: pull a publication date straight out of the URL string.

    Scans *url* with each pattern in ``_URL_DATE_PATS`` and returns a
    ``datetime`` for the first match that forms a real calendar date
    (matches like "20259999" fail validation and are skipped); returns
    ``None`` when no pattern yields a valid date.
    """
    for pattern in _URL_DATE_PATS:
        hit = pattern.search(url)
        if hit is None:
            continue
        try:
            return datetime(*(int(hit.group(g)) for g in ("y", "m", "d")))
        except ValueError:
            # Matched digits do not form a valid date; try the next layout.
            continue
    return None
|
42 |
+
|
43 |
+
|
44 |
+
def is_after_start(url: str, start_ymd: str) -> bool:
    """Return True unless *url*'s publication date predates the window start.

    - start_ymd: 'YYYYMMDD'

    The publication date comes from page metadata first and, failing that,
    from a date embedded in the URL itself. URLs whose date cannot be
    determined at all are kept (True) rather than silently dropped.
    """
    t0 = datetime.strptime(start_ymd, "%Y%m%d")

    pub_dt = _meta_date(url)

    if pub_dt is None:
        pub_dt = _regex_date(url)

    if pub_dt is None:
        return True

    # Bug fix: metadata dates parsed by datetime.fromisoformat may be
    # timezone-aware (e.g. "2025-07-03T10:00:00+01:00"); comparing an
    # aware datetime with the naive t0 raises TypeError. Drop the tzinfo
    # so the date comparison always succeeds.
    if pub_dt.tzinfo is not None:
        pub_dt = pub_dt.replace(tzinfo=None)

    return pub_dt >= t0
|