import json
import os
from pathlib import Path

import requests
import pandas as pd

from .date_verifier import is_after_start
def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Query the Google Custom Search API for up to 10 UK-biased results,
    restricted to a date range given as YYYYMMDD (e.g. 20241230:20250130)."""
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search failed: {e}")
        return []
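
# Usage sketch (illustrative only; the query string is hypothetical, the keys
# are assumed to be exported in the environment, and dates use the API's
# YYYYMMDD range format):
#
#   items = google_search("example pledge text", os.environ["GOOGLE_API_KEY"],
#                         os.environ["GOOGLE_SEARCH_CX"], "20241230", "20250130")
#   links = [item["link"] for item in items]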
def save_tsv(file_name, id_value, string_value, value_list, query):
    """Write the parallel lists as a headerless TSV, one row per URL:
    ID, label ("claim"/"question"), URL, originating query."""
    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)
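
# Example output rows (illustrative values, <TAB> marks the separator):
#
#   0<TAB>claim<TAB>https://example.com/article<TAB>Author: pledge text (date)
#   0<TAB>question<TAB>https://example.com/other<TAB>What did the policy change?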
def ensure_directory_exists(path):
    """Create the parent directory of `path`, refusing locations outside the
    expected sandbox. Paths are resolved to absolute form first, so the
    relative "outputs" prefix is compared against its resolved location."""
    dir_path = Path(path).expanduser().resolve().parent
    allowed_prefixes = ("/home", "/data", str(Path("outputs").resolve()))
    if not str(dir_path).startswith(allowed_prefixes):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)
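
# Example (sketch; hypothetical paths): both calls below create the parent
# directory if it is missing, while a path such as "/etc/results.tsv" raises
# ValueError instead of writing outside the sandbox.
#
#   ensure_directory_exists("/data/run1/results.tsv")
#   ensure_directory_exists("outputs/run1/results.tsv")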
def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, pledge_date, start_date, end_date):
    """Collect search-result URLs for a claim and its evidence questions,
    and write them as a TSV under pipeline_base_dir."""
    if suggestion_meta is None:
        # No suggestion metadata: the whole file is one JSON record.
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.read())
        claim_text = f"{pledge_author}: {qa_lines['claim']} ({pledge_date})"
        idx = 0
    else:
        # Suggestion metadata points at one JSON line in the file.
        idx = suggestion_meta["index"]
        with open(qa_file, "r") as f:
            qa_lines = json.loads(f.readlines()[idx])
        claim_text = qa_lines['claim']
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in the environment.")
    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)
    urls = []
    string_values = []
    queries = []
    # Deduplicate evidence questions while preserving order, then cap at 10.
    questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))[:10]
    # Search on the claim itself, then on each evidence question, skipping
    # URLs already seen, Full Fact's own government tracker, and pages that
    # predate the window.
    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(claim_text)
    for question in questions:
        results = google_search(question, api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"] and is_after_start(result["link"], start_date):
                string_values.append("question")
                urls.append(result["link"])
                queries.append(question)
    # URLs are already unique (enforced at append time), so the parallel
    # lists stay aligned for save_tsv.
    save_tsv(tsv_file_path, [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {idx} to {tsv_file_path}")
    return tsv_file_path
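
# Minimal usage sketch. All paths and pledge metadata below are hypothetical;
# it assumes qa_file is JSON lines with "claim" and "evidence" fields and that
# GOOGLE_API_KEY / GOOGLE_SEARCH_CX are exported. Run with
# `python -m <package>.<module>` because of the relative import above.
if __name__ == "__main__":
    tsv_path = run_augmented_searching(
        qa_file="data/claims.jsonl",
        pipeline_base_dir="outputs/run1",
        suggestion_meta=None,
        pledge_author="HM Government",
        pledge_date="2024-07-04",
        start_date="20241230",
        end_date="20250130",
    )
    print(tsv_path)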