# Pledge_Tracker/system/augmented_searching.py
import json
import os
from pathlib import Path

import pandas as pd
import requests

def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search JSON API, restricted to UK results
    within [start_date, end_date] (YYYYMMDD), and return up to 10 items."""
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,          # maximum results per request
        "sort": sort,       # restrict results to the date range
        "cr": "countryUK",  # country restrict: United Kingdom
        "gl": "uk",         # geolocation bias
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except requests.exceptions.RequestException as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []
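
# Illustrative sketch only (not part of the pipeline): calling google_search
# directly. The credentials and query below are hypothetical placeholders;
# real values come from the environment, as in run_augmented_searching.
#
#   items = google_search(
#       "government pledge on housing targets",
#       api_key="YOUR_API_KEY",
#       search_engine_id="YOUR_CX",
#       start_date="20240101",
#       end_date="20241231",
#   )
#   for item in items:
#       print(item["title"], item["link"])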

def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write one row per URL to a headerless, tab-separated file with columns:
    ID, String (query type), ListValue (URL), query (search text)."""
    data = {
        "ID": id_value,
        "String": string_value,
        "ListValue": value_list,
        "query": query,
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep="\t", index=False, header=False)
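
# A saved row looks like the following (tab-separated, no header; the URL and
# query text here are made up for illustration):
#   0    claim    https://example.org/article    UK Government: pledge text (2024)
# where the second column records whether the "claim" search or a "question"
# search produced the URL.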

def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing locations outside the
    expected /home, /data, or local outputs trees as a basic safety check."""
    dir_path = Path(path).expanduser().resolve().parent
    # Resolve "outputs" against the working directory: dir_path is absolute
    # after resolve(), so a bare relative prefix would never match.
    allowed_prefixes = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_prefixes):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
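
# Illustrative behaviour (paths are hypothetical):
#   ensure_directory_exists("outputs/run1/results.tsv")   # creates outputs/run1
#   ensure_directory_exists("/etc/cron.d/results.tsv")    # raises ValueError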

def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Search Google for a pledge claim and its evidence questions, then save
    the collected URLs as a TSV file under pipeline_base_dir."""
    if suggestion_meta is None:
        # Single-claim mode: the QA file holds one JSON object.
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.read())
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # Suggestion mode: the QA file is JSONL; pick the suggested line.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = f"{pledge_author}: {qa_lines['claim']}"

    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]

    # Search for the claim itself, skipping Full Fact's own tracker pages and
    # recording the exact query that was issued.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_text)

    # Search for each evidence question.
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)

    # URLs are already unique (checked at append time), so the three lists
    # stay aligned row-for-row in the TSV.
    save_tsv(tsv_file_path, [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return tsv_file_path
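
# A minimal sketch of how this module might be invoked, assuming a QA file in
# the single-claim JSON format handled above. The file names, author, and
# dates here are hypothetical placeholders, not values from the pipeline.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="outputs/example_claim.json",      # hypothetical input file
        pipeline_base_dir="outputs/example_run",   # hypothetical output dir
        suggestion_meta=None,                      # single-claim mode
        pledge_author="UK Government",             # hypothetical author
        pledge_date="2024-07-04",                  # hypothetical pledge date
        start_date="20240704",                     # YYYYMMDD range for Google
        end_date="20250704",
    )
    print(f"Results written to {tsv_path}")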