# Pledge_Tracker/system/augmented_searching.py
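"""Augmented searching step of the Pledge Tracker pipeline.

Runs date-restricted Google Custom Search queries for a pledge claim and
its evidence questions, filters and de-duplicates the returned links, and
writes them to a TSV file for downstream processing.
"""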
import json
import os
import requests
import pandas as pd
from pathlib import Path
from .date_verifier import is_after_start
def google_search(query, api_key, search_engine_id, start_date, end_date):
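    """Query the Google Custom Search JSON API for up to 10 results,
    restricted to the UK and to pages dated between start_date and
    end_date (YYYYMMDD strings). Returns the raw result items, or an
    empty list if the request fails."""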
    sort = f"date:r:{start_date}:{end_date}"  # restrict by publication date, e.g. date:r:20241230:20250130
url = "https://www.googleapis.com/customsearch/v1"
params = {
"q": query,
"key": api_key,
"cx": search_engine_id,
"num": 10,
"sort": sort,
"cr": "countryUK",
"gl": "uk"
}
try:
        response = requests.get(url, params=params, timeout=30)  # timeout avoids hanging on a stalled connection
response.raise_for_status()
return response.json().get("items", [])
except Exception as e:
print(f"[ERROR] Google Search Failed: {e}")
return []
def save_tsv(file_name, id_value, string_value, value_list, query):
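    """Write search results as a headerless tab-separated file with one
    row per URL: ID, source type ('claim' or 'question'), URL, and the
    query that produced it."""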
data = {
'ID': id_value,
'String': string_value,
'ListValue': value_list,
'query': query
}
df = pd.DataFrame(data)
df.to_csv(file_name, sep='\t', index=False, header=False)
def ensure_directory_exists(path):
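    """Create the parent directory of `path` if needed, refusing paths
    outside the allowed roots (/home, /data, or the local outputs dir)."""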
    dir_path = Path(path).expanduser().resolve().parent
    # resolve() yields an absolute path, so the relative "outputs" prefix
    # must also be resolved before the comparison can ever match.
    allowed = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
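    """Run augmented searching for one claim: search Google for the claim
    text and up to 10 of its evidence questions, keep links that pass the
    date check and are not Full Fact's government tracker, and write them
    to augmented_search_results.tsv under pipeline_base_dir. Returns the
    path of the written TSV file."""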
    if suggestion_meta is None:
        # Single-claim mode: the file holds one JSON object.
        with open(qa_file, "r") as f:
            qa_lines = json.load(f)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # Suggestion mode: the file is JSON Lines; load the suggested entry.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines["claim"]
api_key = os.environ.get("GOOGLE_API_KEY")
search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
if not api_key or not search_engine_id:
raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
ensure_directory_exists(tsv_file_path)
urls = []
string_values = []
queries = []
    # De-duplicate the evidence questions while preserving order, keeping at most 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]
results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        # Keep new links inside the date window, skipping Full Fact's own tracker.
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_text)  # record the query exactly as issued to the API
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)
    # urls already contains only unique links (checked before each append),
    # so the parallel string_values/queries lists stay aligned with it.
save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
return str(tsv_file_path)
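
# Minimal usage sketch (hypothetical file names, author, and dates; not part
# of the original pipeline entry point). Because of the relative import
# above, run this as a module, e.g. `python -m system.augmented_searching`,
# with GOOGLE_API_KEY and GOOGLE_SEARCH_CX set in the environment.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="data/example_qa.json",           # hypothetical QA file
        pipeline_base_dir="outputs/example_run",  # hypothetical output dir
        suggestion_meta=None,                     # search the top-level claim
        pledge_author="Example Party",            # hypothetical author
        pledge_date="2024-07-04",                 # hypothetical pledge date
        start_date="20240704",                    # YYYYMMDD, per the sort param
        end_date="20250101",
    )
    print(f"[SYSTEM] Results written to {tsv_path}")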