File size: 3,740 Bytes
35b3f62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
import spacy

# Load the small English spaCy model, downloading it on first run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model not installed yet. Use sys.executable (not the bare "python"
    # command) so the download lands in the *same* interpreter/venv that is
    # running this script — otherwise the retry below can fail forever.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

def clean_keywords(text):
    """Extract deduplicated noun-phrase keywords from *text*.

    Each spaCy noun chunk is reduced to its alphabetic, non-stopword
    tokens; the resulting phrase is kept when longer than two characters.
    Returns a list with duplicates removed (order not guaranteed).
    """
    doc = nlp(text)
    phrases = set()
    for chunk in doc.noun_chunks:
        kept = [tok.text for tok in chunk if not tok.is_stop and tok.is_alpha]
        phrase = " ".join(kept)
        if len(phrase) > 2:
            phrases.add(phrase)
    return list(phrases)

def google_search(query, api_key, search_engine_id, start_date, end_date):
    """Run one Google Custom Search query restricted to UK results.

    Args:
        query: Search string.
        api_key: Google API key.
        search_engine_id: Custom Search Engine id (``cx``).
        start_date / end_date: Date-range bounds for the ``date:r`` sort,
            in the API's YYYYMMDD form.

    Returns:
        The ``items`` list from the JSON response, or ``[]`` on any
        request/HTTP failure (best-effort: errors are logged, not raised).
    """
    print(f"[SYSTEM] Calling Google Search API for: {query}")
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        # timeout is essential: without it a stalled connection hangs the
        # whole pipeline indefinitely.
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        return response.json().get("items", [])
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout/HTTP errors;
        # ValueError covers a non-JSON body from .json().
        print(f"[ERROR] Google Search Failed: {e}")
        return []

def save_tsv(file_path, claim_id, claim_text, url_list):
    """Write one TSV row per URL: ID, the literal "claim", URL, claim text.

    No header or index is written; an empty *url_list* yields an empty file.
    """
    rows = [
        {"ID": claim_id, "String": "claim", "ListValue": url, "query": claim_text}
        for url in url_list
    ]
    frame = pd.DataFrame(rows, columns=["ID", "String", "ListValue", "query"])
    frame.to_csv(file_path, sep="\t", index=False, header=False)

def ensure_directory_exists(path):
    """Create the parent directory of *path* if it lies under an allowed root.

    Allowed roots are ``/home``, ``/data``, and an ``outputs`` directory
    under the current working directory.

    Raises:
        ValueError: if the resolved parent directory is outside every
            allowed root.
    """
    dir_path = Path(path).expanduser().resolve().parent
    # BUG FIX: the original compared the *resolved* (absolute) path against
    # the relative prefix "outputs", which could never match. Resolve the
    # outputs root against the CWD so relative "outputs/..." paths work.
    allowed_roots = (
        "/home",
        "/data",
        str((Path.cwd() / "outputs").resolve()),
    )
    if not str(dir_path).startswith(allowed_roots):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

def run_initial_searching(claim_text, pipeline_base_dir, start_date, end_date, user_id, claim_id):
    """Search Google for a claim and persist the claim record and result URLs.

    Writes a single-record ``claim.json`` and a TSV of deduplicated result
    URLs under *pipeline_base_dir*.

    Args:
        claim_text: The claim to search for.
        pipeline_base_dir: Directory that receives ``claim.json`` and
            ``initial_search_results.tsv``.
        start_date / end_date: YYYYMMDD bounds for the date-restricted search.
        user_id: Unused here; kept for interface compatibility with callers.
        claim_id: Identifier written alongside the claim and into the TSV.

    Returns:
        Tuple of ``(tsv_file_path, manifesto_json_file)`` as strings.

    Raises:
        EnvironmentError: if the Google API credentials are not set.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    manifesto_json_file = os.path.join(pipeline_base_dir, "claim.json")
    tsv_file_path = os.path.join(pipeline_base_dir, "initial_search_results.tsv")
    ensure_directory_exists(tsv_file_path)

    # Overwrite claim.json with a single-record list for this claim.
    records = [{"claim_id": claim_id, "claim": claim_text}]
    with open(manifesto_json_file, "w") as f:
        json.dump(records, f, indent=1)

    # Two API calls: the raw claim text, then all extracted keywords joined
    # into one query (cheaper than one call per keyword).
    queries = [claim_text, " ".join(clean_keywords(claim_text))]
    urls = []
    for query in queries:
        results = google_search(query, api_key, search_engine_id, start_date, end_date)
        urls.extend(r["link"] for r in results if "link" in r)
    # De-duplicate while preserving first-seen order.
    urls = list(dict.fromkeys(urls))

    save_tsv(str(tsv_file_path), claim_id, claim_text, urls)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
    return str(tsv_file_path), str(manifesto_json_file)