yulongchen committed on
Commit
558c227
·
1 Parent(s): 273bc17

Add system

Browse files
system/augmented_searching.py CHANGED
@@ -6,10 +6,11 @@ import pandas as pd
6
  from datetime import datetime
7
  from pathlib import Path
8
  import spacy
 
9
 
10
  def google_search(query, api_key, search_engine_id, start_date, end_date):
11
  # print(f"[SYSTEM] Calling Google Search API for: {query}")
12
- sort = f"date:r:{start_date}:{end_date}"
13
  url = "https://www.googleapis.com/customsearch/v1"
14
  params = {
15
  "q": query,
@@ -80,7 +81,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
80
 
81
  results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
82
  for result in results:
83
- if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
84
  string_values.append("claim")
85
  urls.append(result["link"])
86
  queries.append(f"{pledge_author}: {claim_text}")
@@ -88,7 +89,7 @@ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_
88
  for question in questions:
89
  results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
90
  for result in results:
91
- if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
92
  string_values.append("question")
93
  urls.append(result["link"])
94
  queries.append(f"{question}")
 
6
  from datetime import datetime
7
  from pathlib import Path
8
  import spacy
9
+ from date_verifier import is_after_start
10
 
11
  def google_search(query, api_key, search_engine_id, start_date, end_date):
12
  # print(f"[SYSTEM] Calling Google Search API for: {query}")
13
+ sort = f"date:r:{start_date}:{end_date}" #20241230:20250130
14
  url = "https://www.googleapis.com/customsearch/v1"
15
  params = {
16
  "q": query,
 
81
 
82
  results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
83
  for result in results:
84
+ if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
85
  string_values.append("claim")
86
  urls.append(result["link"])
87
  queries.append(f"{pledge_author}: {claim_text}")
 
89
  for question in questions:
90
  results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
91
  for result in results:
92
+ if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
93
  string_values.append("question")
94
  urls.append(result["link"])
95
  queries.append(f"{question}")
system/date_verifier.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import re, trafilatura
3
+ from trafilatura.settings import DEFAULT_CONFIG
4
+
5
# Cap the size of pages trafilatura will download/process.
# NOTE(review): DEFAULT_CONFIG comes from trafilatura.settings — confirm that
# plain attribute assignment is honored (vs. a configparser set() call).
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000

# Date patterns commonly embedded in article URLs, tried in order by
# _regex_date(); each exposes named groups y/m/d.
_URL_DATE_PATS = [
    re.compile(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"),  # 2025-07-03
    re.compile(r"(?P<y>\d{4})/(?P<m>\d{2})/(?P<d>\d{2})"),  # 2025/07/03
    re.compile(r"(?P<y>\d{4})(?P<m>\d{2})(?P<d>\d{2})"),  # 20250703
]
12
+
13
def _meta_date(url: str):
    """Return the publication date found in the page metadata of *url*, or None.

    Downloads the page with trafilatura (honoring DEFAULT_CONFIG) and parses
    the extracted metadata date as ISO-8601. If the full string does not
    parse, retries with only the calendar-date portion before any 'T'
    separator; any remaining failure yields None.
    """
    page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
    if not page:
        return None

    metadata = trafilatura.extract_metadata(page)
    if not metadata or not metadata.date:
        return None

    raw_date = metadata.date
    try:
        return datetime.fromisoformat(raw_date)
    except ValueError:
        pass

    # Some sites emit a full timestamp this fromisoformat rejects;
    # fall back to just the date part before the 'T'.
    try:
        return datetime.fromisoformat(raw_date.split("T")[0])
    except Exception:
        return None
29
+
30
def _regex_date(url: str):
    """Return a date embedded in *url* itself, or None if no pattern matches.

    Each pattern in _URL_DATE_PATS is tried in order; a match with an
    invalid calendar date (e.g. month 13) is skipped and the search
    continues with the next pattern.
    """
    for pattern in _URL_DATE_PATS:
        match = pattern.search(url)
        if match is None:
            continue
        try:
            return datetime(
                int(match.group("y")),
                int(match.group("m")),
                int(match.group("d")),
            )
        except ValueError:
            # Matched digits that are not a real date — keep looking.
            continue
    return None
42
+
43
+
44
def is_after_start(url: str, start_ymd: str) -> bool:
    """Return True if the page at *url* appears published on/after start_ymd.

    Args:
        url: Page URL whose publication date is checked.
        start_ymd: Window start as 'YYYYMMDD' (e.g. '20241230').

    Returns:
        True when the detected publication date is >= the start date, or
        when no date can be determined at all (best-effort: undated pages
        are kept rather than dropped).

    Raises:
        ValueError: if start_ymd is not in 'YYYYMMDD' form.
    """
    window_start = datetime.strptime(start_ymd, "%Y%m%d")

    # Prefer the page's own metadata; fall back to a date embedded in the URL.
    pub_dt = _meta_date(url)
    if pub_dt is None:
        pub_dt = _regex_date(url)

    if pub_dt is None:
        # No date found anywhere — keep the result rather than discard it.
        return True

    # BUG FIX: _meta_date may return a timezone-aware datetime (ISO strings
    # carrying an offset); comparing aware vs. naive raises TypeError, so
    # normalize to a naive datetime before the comparison.
    if pub_dt.tzinfo is not None:
        pub_dt = pub_dt.replace(tzinfo=None)

    return pub_dt >= window_start