import json
import os
from pathlib import Path

import requests
import pandas as pd

from .date_verifier import is_after_start
def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search API for up to 10 UK-biased results,
    restricted to a date range given as YYYYMMDD (e.g. 20241230:20250130)."""
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []
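
# Usage sketch (illustrative only; the query string is hypothetical, the keys
# are assumed to be exported in the environment, and dates use the API's
# YYYYMMDD range format):
#
#   items = google_search("example pledge text", os.environ["GOOGLE_API_KEY"],
#                         os.environ["GOOGLE_SEARCH_CX"], "20241230", "20250130")
#   links = [item["link"] for item in items]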
def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write the parallel lists as a headerless TSV, one row per URL:
    ID, label ("claim"/"question"), URL, originating query."""
    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)
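
# Example output rows (illustrative values, <TAB> marks the separator):
#
#   0<TAB>claim<TAB>https://example.com/article<TAB>Author: pledge text (date)
#   0<TAB>question<TAB>https://example.com/other<TAB>What did the policy change?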
def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing locations outside the
    expected sandbox. Paths are resolved to absolute form first, so the
    relative "outputs" prefix is compared against its resolved location."""
    dir_path = Path(path).expanduser().resolve().parent
    allowed_prefixes = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_prefixes):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
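
# Example (sketch; hypothetical paths): both calls below create the parent
# directory if it is missing, while a path such as "/etc/results.tsv" raises
# ValueError instead of writing outside the sandbox.
#
#   ensure_directory_exists("/data/run1/results.tsv")
#   ensure_directory_exists("outputs/run1/results.tsv")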
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Collect search-result URLs for a claim and its evidence questions,
    and write them as a TSV under pipeline_base_dir."""
    if suggestion_meta is None:
        # No suggestion metadata: the whole file is one JSON record.
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.read())
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # Suggestion metadata points at one JSON line in the file.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines['claim']
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in the environment.")
    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)
    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]
    # Search on the claim itself, then on each evidence question, skipping
    # URLs already seen, Full Fact's own government tracker, and pages that
    # predate the window.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_text)
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)
    # URLs are already unique (enforced at append time), so the parallel
    # lists stay aligned for save_tsv.
    save_tsv(tsv_file_path, [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return tsv_file_path
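
# Minimal usage sketch. All paths and pledge metadata below are hypothetical;
# it assumes qa_file is JSON lines with "claim" and "evidence" fields and that
# GOOGLE_API_KEY / GOOGLE_SEARCH_CX are exported. Run with
# `python -m <package>.<module>` because of the relative import above.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="data/claims.jsonl",
        pipeline_base_dir="outputs/run1",
        suggestion_meta=None,
        pledge_author="HM Government",
        pledge_date="2024-07-04",
        start_date="20241230",
        end_date="20250130",
    )
    print(tsv_path)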