from huggingface_hub import login
from datetime import datetime
import os, time
import pandas as pd

from system.initial_searching import run_initial_searching
from system.scraper import run_scraper
from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
from system.augmented_searching import run_augmented_searching
from system.generate_output import process_manifesto_data_with_metadata
from system.ee import run_gpt4_event_extraction
from system.process_time import extract_and_sort_events
import spacy
import subprocess
from huggingface_hub import hf_hub_download
import json
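
# Pipeline overview (as implemented below):
#   1. Initial Google search for the pledge claim
#   2. Scraping of the returned URLs
#   3. HerO processing to generate relevant queries (questions)
#   4. Augmented search with those queries, followed by scraping
#   5. HerO reranking and evidence/document selection
#   6. GPT-4 event extraction and temporal sorting of the events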

# Make sure the spaCy English model is available; download it on first run.
try:
    spacy.load("en_core_web_sm")
except OSError:
    print("🔍 Downloading en_core_web_sm model ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    spacy.load("en_core_web_sm")


def count_total_events(output_path):
    """Count how many events were extracted in total across all documents."""
    with open(output_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    total_events = 0
    for result in results:
        total_events += len(result["output"]["events"])

    print(f"{total_events} events in total")
    return total_events
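
# Expected shape of the extraction output read above (inferred from the access
# pattern; only the "output" -> "events" fields are relied on):
# [
#   {"output": {"events": [ ... ]}},
#   ...
# ]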

def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
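    """Run the pledge-tracking pipeline end to end.

    Argument descriptions are inferred from how the values are used below:
        claim: the pledge text to track.
        pledge_date / pledge_author: metadata about the pledge.
        start_date: earliest date for retrieved evidence (e.g. "20250504").
        timestamp, user_id: used to build the per-run output directory.
        update_fn: optional callable invoked as update_fn(step_id, message)
            to report progress.
        suggestion_meta: optional dict with an "index" into the cached QA file;
            when given, the initial search, scraping and HerO steps are skipped.
    """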
    pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
    os.makedirs(pipeline_base_dir, exist_ok=True)

    step_id = 1

    # Step 1: Google search
    if suggestion_meta is None:
        print("🔍 Step 1: Initial searching ...")
        initial_tsv_file, claim_json_path = run_initial_searching(
            claim_text=f"{pledge_author} : {claim} ({pledge_date})",
            # pledge_author=pledge_author,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            end_date="",
            user_id=user_id,
            claim_id=0,
        )
        with open(initial_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"{line_count} URLs are retrieved")
            step_id += 1
        

        print("🌐 Step 2: Scraping URLs ...")
        initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
        os.makedirs(initial_data_store_dir, exist_ok=True)
        initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
        run_scraper(initial_tsv_file, initial_scraped_output_path)

        with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn:
            update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
            step_id += 1


        print("🧠 Step 3: HerO processing ...")
        hero_output_dir = os.path.join(pipeline_base_dir, "hero")
        os.makedirs(hero_output_dir, exist_ok=True)
        run_hero_pipeline(pipeline_base_dir)    

        qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")

        with open(qa_file_path, "r", encoding="utf-8") as f:
            questions = list({item["question"] for item in json.load(f)["evidence"]})
            line_count = len(questions)
        if update_fn:
            examples = "\n".join(f"    {i}. {q}" for i, q in enumerate(questions[:5], start=1))
            update_fn(step_id, f"{line_count} relevant queries are generated, for example:\n{examples}")
            step_id += 1

    else:
        # A suggestion was selected: reuse the cached QA file from the feedback
        # dataset instead of re-running the initial search, scraping and HerO steps.
        claim_json_path = None
        initial_scraped_output_path = None
        initial_tsv_file = None
        hero_output_dir = None
        qa_file_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_with_QA_icl_top_k_qa.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"],
        )
        idx = suggestion_meta["index"]
        with open(qa_file_path, "r", encoding="utf-8") as f:
            qa_line = f.readlines()[idx]
        questions = list({item["question"] for item in json.loads(qa_line)["evidence"]})
        line_count = len(questions)
        if update_fn:
            examples = "\n".join(f"    {i}. {q}" for i, q in enumerate(questions[:5], start=1))
            update_fn(step_id, f"Relevant queries are generated, for example:\n{examples}")
            step_id += 1
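
    # Step 4: Augmented searching with the queries generated above.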

    try:
        augmented_tsv_file = run_augmented_searching(
            qa_file=qa_file_path,
            pledge_author=pledge_author,
            pledge_date=pledge_date,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            suggestion_meta=suggestion_meta,
            end_date="",
        )

        

        with open(augmented_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"{line_count} URLs are retrieved")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
        raise
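
    # Step 5: Scrape the pages returned by the augmented search.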

    augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
    os.makedirs(augmented_data_store_dir, exist_ok=True)

    try: 
        augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
        run_scraper(augmented_tsv_file, augmented_scraped_output_path)

        with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn:
            update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_scraper failed: {e}")
        raise
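
    # Step 6: Rerank the evidence with HerO and select the supporting documents.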
    

    try:
        run_hero_reranking(pipeline_base_dir, suggestion_meta)

        meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
        all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
        unique_urls = set()
        with open(all_info_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                docs = data.get("evidence", [])
                for doc in docs:
                    if "url" in doc:
                        unique_urls.add(doc["url"])
        if update_fn:
            update_fn(step_id, f"{len(unique_urls)} documents are selected")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
        raise
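
    # Step 7: Extract events from the selected documents with GPT-4.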
    
    try:
        extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)

        events_num = count_total_events(extracted_event_path)

        if update_fn:
            update_fn(step_id, f"{events_num} events are extracted from those documents.")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ Event extraction failed: {e}")
        raise


    print("📅 Sorting events temporally ...")

    
    sorted_events = extract_and_sort_events(
        data_dir=pipeline_base_dir, 
        pledge_date=pledge_date, 
        pledge_author=pledge_author, 
        claim=claim,
        suggestion_meta=suggestion_meta
        )

    df = pd.DataFrame(sorted_events)
    sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
    df.to_excel(sorted_event_path, index=False)
    print(sorted_event_path)

    if update_fn: 
        update_fn(step_id, "All done!")
        step_id += 1

    return {
        "claim_json": claim_json_path,
        "initial_scraped_jsonl": initial_scraped_output_path,
        "initial_tsv_file": initial_tsv_file,
        "hero_dir": hero_output_dir,
        "augmented_scraped_jsonl": augmented_scraped_output_path,
        "augmented_tsv_file": augmented_tsv_file,
        "meta_data_dir": meta_data_dir,
        "unsorted_events": extracted_event_path,
        "sorted_events": sorted_event_path,
        "step_id": step_id
    }


if __name__ == "__main__":
    start = time.time()

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        print("No Hugging Face token found in environment variable HF_TOKEN.")
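
    # Example progress callback (hypothetical helper, not part of the original
    # pipeline): run_pipeline reports progress as update_fn(step_id, message),
    # so a callable like this can be passed via its update_fn argument.
    def print_progress(step_id, message):
        print(f"[step {step_id}] {message}")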

    claim = "“We will support families with children by introducing free breakfast clubs in every primary school”"
    start_date = "20250504"
    pledge_date = "xxxxx"    # placeholder, in the same style as timestamp/user_id
    pledge_author = "xxx"    # placeholder
    timestamp = "xxxxx"
    user_id = "xxx"

    outputs = run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id)
    print("🎯 Pipeline finished. Outputs:", outputs)
    print(f"⏱️ Total time: {time.time() - start:.2f} seconds")