from huggingface_hub import login
from datetime import datetime
import os, time
import pandas as pd

from system.initial_searching import run_initial_searching
from system.scraper import run_scraper
from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
from system.augmented_searching import run_augmented_searching
from system.generate_output import process_manifesto_data_with_metadata
from system.ee import run_gpt4_event_extraction
from system.process_time import extract_and_sort_events
import spacy
import subprocess
from huggingface_hub import hf_hub_download
import json
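
# Pipeline overview (as implemented below):
#   1. Initial Google search for the pledge claim
#   2. Scraping of the returned URLs
#   3. HerO processing to generate relevant queries (questions)
#   4. Augmented search with those queries, followed by scraping
#   5. HerO reranking and evidence/document selection
#   6. GPT-4 event extraction and temporal sorting of the events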

# Make sure the spaCy English model is available; download it on first run.
try:
    spacy.load("en_core_web_sm")
except OSError:
    print("🔍 Downloading en_core_web_sm model ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    spacy.load("en_core_web_sm")


def count_total_events(output_path):
    """Count how many events were extracted in total across all documents."""
    with open(output_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    total_events = 0
    for result in results:
        total_events += len(result["output"]["events"])

    print(f"{total_events} events in total")
    return total_events
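
# Expected shape of the extraction output read above (inferred from the access
# pattern; only the "output" -> "events" fields are relied on):
# [
#   {"output": {"events": [ ... ]}},
#   ...
# ]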

def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
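    """Run the pledge-tracking pipeline end to end.

    Argument descriptions are inferred from how the values are used below:
        claim: the pledge text to track.
        pledge_date / pledge_author: metadata about the pledge.
        start_date: earliest date for retrieved evidence (e.g. "20250504").
        timestamp, user_id: used to build the per-run output directory.
        update_fn: optional callable invoked as update_fn(step_id, message)
            to report progress.
        suggestion_meta: optional dict with an "index" into the cached QA file;
            when given, the initial search, scraping and HerO steps are skipped.
    """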
    pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
    os.makedirs(pipeline_base_dir, exist_ok=True)

    step_id = 1

    # Step 1: Google search
    if suggestion_meta is None:
        print("🔍 Step 1: Initial searching ...")
        initial_tsv_file, claim_json_path = run_initial_searching(
            claim_text=f"{pledge_author} : {claim} ({pledge_date})",
            # pledge_author=pledge_author,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            end_date="",
            user_id=user_id,
            claim_id=0,
        )
        with open(initial_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"{line_count} URLs are retrieved")
            step_id += 1
        

        print("🌐 Step 2: Scraping URLs ...")
        initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
        os.makedirs(initial_data_store_dir, exist_ok=True)
        initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
        run_scraper(initial_tsv_file, initial_scraped_output_path)

        with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn:
            update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
            step_id += 1


        print("🧠 Step 3: HerO processing ...")
        hero_output_dir = os.path.join(pipeline_base_dir, "hero")
        os.makedirs(hero_output_dir, exist_ok=True)
        run_hero_pipeline(pipeline_base_dir)    

        qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")

        with open(qa_file_path, "r", encoding="utf-8") as f:
            questions = list({item["question"] for item in json.load(f)["evidence"]})
            line_count = len(questions)
        if update_fn:
            examples = "\n".join(f"    {i}. {q}" for i, q in enumerate(questions[:5], start=1))
            update_fn(step_id, f"{line_count} relevant queries are generated, for example:\n{examples}")
            step_id += 1

    else:
        # A suggestion was selected: reuse the cached QA file from the feedback
        # dataset instead of re-running the initial search, scraping and HerO steps.
        claim_json_path = None
        initial_scraped_output_path = None
        initial_tsv_file = None
        hero_output_dir = None
        qa_file_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_with_QA_icl_top_k_qa.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"],
        )
        idx = suggestion_meta["index"]
        with open(qa_file_path, "r", encoding="utf-8") as f:
            qa_line = f.readlines()[idx]
        questions = list({item["question"] for item in json.loads(qa_line)["evidence"]})
        line_count = len(questions)
        if update_fn:
            examples = "\n".join(f"    {i}. {q}" for i, q in enumerate(questions[:5], start=1))
            update_fn(step_id, f"Relevant queries are generated, for example:\n{examples}")
            step_id += 1
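
    # Step 4: Augmented searching with the queries generated above.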

    try:
        augmented_tsv_file = run_augmented_searching(
            qa_file=qa_file_path,
            pledge_author=pledge_author,
            pledge_date=pledge_date,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            suggestion_meta=suggestion_meta,
            end_date="",
        )

        

        with open(augmented_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"{line_count} URLs are retrieved")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_augmented_searching failed: {e}")
        raise
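
    # Step 5: Scrape the pages returned by the augmented search.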

    augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
    os.makedirs(augmented_data_store_dir, exist_ok=True)

    try: 
        augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
        run_scraper(augmented_tsv_file, augmented_scraped_output_path)

        with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn:
            update_fn(step_id, f"{line_count} URL pages have been successfully scraped")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_scraper failed: {e}")
        raise
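
    # Step 6: Rerank the evidence with HerO and select the supporting documents.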
    

    try:
        run_hero_reranking(pipeline_base_dir, suggestion_meta)

        meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
        all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
        unique_urls = set()
        with open(all_info_path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                docs = data.get("evidence", [])
                for doc in docs:
                    if "url" in doc:
                        unique_urls.add(doc["url"])
        if update_fn:
            update_fn(step_id, f"{len(unique_urls)} documents are selected")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ run_hero_reranking failed: {e}")
        raise
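
    # Step 7: Extract events from the selected documents with GPT-4.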
    
    try:
        extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, max_tokens=100000)

        events_num = count_total_events(extracted_event_path)

        if update_fn:
            update_fn(step_id, f"{events_num} events are extracted from those documents.")
            step_id += 1
    except Exception as e:
        if update_fn:
            update_fn(step_id, f"❌ Event extraction failed: {e}")
        raise


    print("📅 Sorting events temporally ...")

    
    sorted_events = extract_and_sort_events(
        data_dir=pipeline_base_dir, 
        pledge_date=pledge_date, 
        pledge_author=pledge_author, 
        claim=claim,
        suggestion_meta=suggestion_meta
        )

    df = pd.DataFrame(sorted_events)
    sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
    df.to_excel(sorted_event_path, index=False)
    print(sorted_event_path)

    if update_fn: 
        update_fn(step_id, "All done!")
        step_id += 1

    return {
        "claim_json": claim_json_path,
        "initial_scraped_jsonl": initial_scraped_output_path,
        "initial_tsv_file": initial_tsv_file,
        "hero_dir": hero_output_dir,
        "augmented_scraped_jsonl": augmented_scraped_output_path,
        "augmented_tsv_file": augmented_tsv_file,
        "meta_data_dir": meta_data_dir,
        "unsorted_events": extracted_event_path,
        "sorted_events": sorted_event_path,
        "step_id": step_id
    }


if __name__ == "__main__":
    start = time.time()

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        print("No Hugging Face token found in environment variable HF_TOKEN.")
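
    # Example progress callback (hypothetical helper, not part of the original
    # pipeline): run_pipeline reports progress as update_fn(step_id, message),
    # so a callable like this can be passed via its update_fn argument.
    def print_progress(step_id, message):
        print(f"[step {step_id}] {message}")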

    claim = "“We will support families with children by introducing free breakfast clubs in every primary school”"
    start_date = "20250504"
    pledge_date = "xxxxx"    # placeholder, in the same style as timestamp/user_id
    pledge_author = "xxx"    # placeholder
    timestamp = "xxxxx"
    user_id = "xxx"

    outputs = run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id)
    print("🎯 Pipeline finished. Outputs:", outputs)
    print(f"⏱️ Total time: {time.time() - start:.2f} seconds")