import json
import os
from pathlib import Path

import pandas as pd
import requests

from .date_verifier import is_after_start


def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search API, restricted to a date range and UK results."""
    # Date-range restriction, e.g. "date:r:20241230:20250130"
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk",
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []


def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write one TSV row per URL: ID, source label ("claim"/"question"), URL, and the query used."""
    data = {
        "ID": id_value,
        "String": string_value,
        "ListValue": value_list,
        "query": query,
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep="\t", index=False, header=False)


def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing paths outside the allowed roots."""
    dir_path = Path(path).expanduser().resolve().parent
    # The resolved path is absolute, so "outputs" must also be resolved before comparing.
    allowed_roots = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_roots):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)


def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta,
                            pledge_author, pledge_date, start_date, end_date):
    """Search Google for a claim and its evidence questions; save the collected URLs as a TSV."""
    if suggestion_meta is None:
        # Single-claim JSON file: search for the full attributed pledge.
        with open(qa_file, "r") as f:
            qa_lines = json.load(f)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # JSONL file: pick out the suggested claim by its line index.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = f"{qa_lines['claim']}"

    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError(
            "[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment."
        )

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(
        evidence["question"] for evidence in qa_lines["evidence"]
    ))[:10]

    # Search for the claim itself. Skip duplicate links, Full Fact's own
    # government tracker, and pages published before the pledge start date.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if (result["link"] not in urls
                and "fullfact.org/government-tracker" not in result["link"]
                and is_after_start(result["link"], start_date)):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_text)  # record the query actually sent

    # Search for each evidence question, with the same filters.
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if (result["link"] not in urls
                    and "fullfact.org/government-tracker" not in result["link"]
                    and is_after_start(result["link"], start_date)):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)

    # `urls` is already duplicate-free (guarded above), so all four columns stay aligned.
    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
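

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The file paths, pledge metadata, and date
# strings below are assumptions, not values from the pipeline. Because of the
# relative import above, run this as a module, e.g.
# `python -m <package>.<this_module>` (package name assumed), with
# GOOGLE_API_KEY and GOOGLE_SEARCH_CX exported in the environment.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="outputs/example_claim.json",  # hypothetical QA file: {"claim": ..., "evidence": [{"question": ...}, ...]}
        pipeline_base_dir="outputs/pipeline",  # hypothetical output directory
        suggestion_meta=None,                  # None -> single-claim JSON file
        pledge_author="HM Government",         # illustrative author
        pledge_date="2024-07-04",              # illustrative pledge date
        start_date="20240704",                 # YYYYMMDD, as the `sort` restriction expects
        end_date="20250130",
    )
    print(tsv_path)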