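"""Entry point for the PledgeTracker evidence pipeline.

Given a pledge (claim text, author, pledge date, and search start date), the
pipeline searches the web for evidence, scrapes and reranks the retrieved
documents, extracts events with GPT-4, and sorts them chronologically into an
Excel file under outputs/<timestamp>_<user_id>/.
"""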
from huggingface_hub import login
from datetime import datetime
import os, time
import pandas as pd

from system.initial_searching import run_initial_searching
from system.scraper import run_scraper
from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
from system.augmented_searching import run_augmented_searching
from system.generate_output import process_manifesto_data_with_metadata
from system.ee import run_gpt4_event_extraction
from system.process_time import extract_and_sort_events
import spacy
import subprocess
from huggingface_hub import hf_hub_download
import json

# Ensure the spaCy English model is available; download it on first run.
try:
    spacy.load("en_core_web_sm")
except OSError:
    print("πŸ” Downloading en_core_web_sm model ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
    spacy.load("en_core_web_sm")


def count_total_events(output_path):
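    """Count and return the total number of extracted events in the JSON file at output_path."""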
    with open(output_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    total_events = 0
    for item in results:
        try:
            events = item["output"]
            if isinstance(events, list):
                total_events += len(events)
            else:
                print(f"Invalid 'output' value (expected a list): {events}")
        except KeyError:
            print(f"Item missing 'output' field: {item}")

    print(f"{total_events} events in total")
    return total_events

def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
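    """Run the full evidence pipeline for a single pledge.

    When suggestion_meta is None, the initial search, scraping and HerO question
    generation steps are run; otherwise a precomputed QA file is downloaded from
    the Hugging Face Hub and the pipeline starts from the augmented search.
    update_fn, if provided, is called as update_fn(step_id, message) to report
    progress. Returns a dict mapping output names to file paths.
    """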
    pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
    os.makedirs(pipeline_base_dir, exist_ok=True)

    step_id = 1

    # Step 1: Google search for the claim (skipped when suggestion_meta is provided)
    if suggestion_meta is None:
        print("πŸ” Step 1: Initial searching ...")
        initial_tsv_file, claim_json_path = run_initial_searching(
            claim_text=f"{pledge_author} : {claim}",
            # pledge_author=pledge_author,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            end_date="",
            user_id=user_id,
            claim_id=0,
        )
        with open(initial_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"We have found {line_count} URLs")
            step_id+=1
        

        print("🌐 Step 2: Scraping URLs ...")
        initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
        os.makedirs(initial_data_store_dir, exist_ok=True)
        initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
        run_scraper(initial_tsv_file, initial_scraped_output_path)

        with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn: 
            update_fn(step_id, f"We have scraped {line_count} URLs")
            step_id+=1


        print("🧠 Step 3: HerO processing ...")
        hero_output_dir = os.path.join(pipeline_base_dir, "hero")
        os.makedirs(hero_output_dir, exist_ok=True)
        run_hero_pipeline(pipeline_base_dir)    

        qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")

        with open(qa_file_path, "r", encoding="utf-8") as f:
            questions = {qa["question"] for qa in json.load(f)["evidence"]}
            line_count = len(questions)
        if update_fn: 
            update_fn(step_id, f"We have generated {line_count} search queries")
            step_id+=1

    else: 
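        # Feedback run: reuse a precomputed QA file from the Hugging Face dataset instead of regenerating questions.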
        claim_json_path = None
        initial_scraped_output_path = None
        initial_tsv_file = None
        hero_output_dir = None
        qa_file_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_with_QA_icl_top_k_qa.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"],
        )
        print(qa_file_path)


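    # Step 4: Augmented searching based on the generated questions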
    augmented_tsv_file = run_augmented_searching(
        qa_file=qa_file_path,
        pledge_author=pledge_author,
        pipeline_base_dir=pipeline_base_dir,
        start_date=start_date,
        suggestion_meta=suggestion_meta,
        end_date="",
        user_id=user_id,
        claim_id=0,
    )
    with open(augmented_tsv_file, "r", encoding="utf-8") as f:
        line_count = sum(1 for line in f)
    if update_fn:
        update_fn(step_id, f"We have found {line_count} URLs")
        step_id+=1

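    # Step 5: Scrape the augmented search results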
    augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
    os.makedirs(augmented_data_store_dir, exist_ok=True)
    augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
    run_scraper(augmented_tsv_file, augmented_scraped_output_path)

    with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
        line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
    if update_fn: 
        update_fn(step_id, f"We have scraped {line_count} URLs")
        step_id+=1
    

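    # Step 6: HerO reranking of the scraped evidence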
    run_hero_reranking(pipeline_base_dir, suggestion_meta)

    # Step 7: Prepare the reranked evidence in the format expected by GPT-4

    meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)

    # Step 8: Event extraction using GPT-4
    print("🧠 Extracting events ...")

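    # Count distinct evidence URLs for the progress message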
    all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
    unique_urls = set()
    with open(all_info_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            docs = data.get("evidence", [])
            for doc in docs:
                if "url" in doc:
                    unique_urls.add(doc["url"])
    if update_fn: 
        update_fn(step_id, f"We have found {len(unique_urls)} most relevant documents")
        step_id+=1
    
    extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)

    events_num = count_total_events(extracted_event_path)

    if update_fn: 
        update_fn(step_id, f"We have extracted {events_num} events from the documents.")
        step_id+=1


    # Step 9: Sorting events and label usefulness
    print("πŸ“… Sorting events temporally ...")

    
    sorted_events = extract_and_sort_events(
        data_dir=pipeline_base_dir,
        pledge_date=pledge_date,
        pledge_author=pledge_author,
        claim=claim,
        suggestion_meta=suggestion_meta,
    )
    print(sorted_events)
    df = pd.DataFrame(sorted_events)
    sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
    df.to_excel(sorted_event_path, index=False)
    print(sorted_event_path)

    # if update_fn: 
    #     update_fn(step_id, "All done!")
    #     step_id += 1

    return {
        "claim_json": claim_json_path,
        "initial_scraped_jsonl": initial_scraped_output_path,
        "initial_tsv_file": initial_tsv_file,
        "hero_dir": hero_output_dir,
        "augmented_scraped_jsonl": augmented_scraped_output_path,
        "augmented_tsv_file": augmented_tsv_file,
        "meta_data_dir": meta_data_dir,
        "unsorted_events": extracted_event_path,
        "sorted_events": sorted_event_path,
        "step_id": step_id
    }


if __name__ == "__main__":
    start = time.time()

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        print("No Hugging Face token found in environment variable HF_TOKEN.")

    claim = "β€œWe will support families with children by introducing free breakfast clubs in every primary school”"
    start_date = "20250504"
    timestamp = "xxxxx"
    user_id = "xxx"

    outputs = run_pipeline(claim, time_start, timestamp, user_id)
    print("🎯 Pipeline finished. Outputs:", outputs)
    print(f"⏱️ Total time: {time.time() - start:.2f} seconds")