import json
import os
import requests
import pandas as pd
from pathlib import Path
from .date_verifier import is_after_start

def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search JSON API, restricting results to the
    UK and to pages dated between start_date and end_date (YYYYMMDD)."""
    sort = f"date:r:{start_date}:{end_date}"  # e.g. 20241230:20250130
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk",
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []

def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write the parallel ID / label / URL / query columns to a headerless
    tab-separated file."""
    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)

def ensure_directory_exists(path):
    dir_path = Path(path).expanduser().resolve().parent
    # resolve() always returns an absolute path, so the relative "outputs"
    # prefix must itself be resolved for the comparison ever to match.
    allowed_prefixes = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_prefixes):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    if suggestion_meta is None:
        # Single-claim mode: the QA file holds one JSON object.
        with open(qa_file, "r") as f:
            qa_lines = json.load(f)
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # Suggestion mode: the QA file is JSON Lines; read the suggested line.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines["claim"]

    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    urls = []
    string_values = []
    queries = []
    # Collect up to 10 unique evidence questions, preserving order.
    questions = list(dict.fromkeys(e["question"] for e in qa_lines["evidence"]))[:10]

    # Search the claim itself, skipping Full Fact's own tracker pages and
    # pages published before the window start.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        link = result["link"]
        if link not in urls and "fullfact.org/government-tracker" not in link and is_after_start(link, start_date):
            string_values.append("claim")
            urls.append(link)
            queries.append(claim_text)  # record the query actually issued
    
    # Search each evidence question with the same filters.
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            link = result["link"]
            if link not in urls and "fullfact.org/government-tracker" not in link and is_after_start(link, start_date):
                string_values.append("question")
                urls.append(link)
                queries.append(question)

    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return str(tsv_file_path)
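

# A minimal usage sketch, assuming this module lives in a package (the
# relative import of date_verifier above means it must be run with
# `python -m <package>.<module>`). All paths, the author, and the date
# window below are hypothetical placeholders, not values from the pipeline.
if __name__ == "__main__":
    # GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in the environment.
    out_path = run_augmented_searching(
        qa_file="outputs/example_claim.json",     # hypothetical input file
        pipeline_base_dir="outputs/example_run",  # hypothetical output dir
        suggestion_meta=None,                     # single-claim mode
        pledge_author="Example Party",            # hypothetical author
        pledge_date="4 July 2024",                # hypothetical pledge date
        start_date="20240704",                    # window start, YYYYMMDD
        end_date="20250130",                      # window end, YYYYMMDD
    )
    print(f"[SYSTEM] Results written to {out_path}")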