yulongchen committed
Commit 35b3f62 · 1 Parent(s): 10e50a5
Dockerfile ADDED
@@ -0,0 +1,10 @@
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+ COPY . /app
5
+
6
+ RUN pip install --no-cache-dir -r requirements.txt && python -m spacy download en_core_web_sm
7
+
8
+ EXPOSE 7860
9
+
10
+ CMD ["python", "app.py"]
README copy.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: Demo
3
+ emoji: 🔥
4
+ colorFrom: yellow
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.32.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,7 +1,362 @@
1
- import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
1
+ from flask import Flask, jsonify, send_file, request, send_from_directory
2
+ from flask_cors import CORS
3
+ import os, json, uuid, time
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ from huggingface_hub import HfApi
7
+ import sys
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from system.pledge_tracking import run_pipeline
11
+ from huggingface_hub import hf_hub_download
12
+ import spacy
13
+ import traceback
14
+ import threading
15
 
16
+ nlp = spacy.load("en_core_web_sm")
 
17
 
18
+ app = Flask(__name__, static_folder='.')
19
+ CORS(app)
20
+
21
+ HF_DATASET_REPO = "PledgeTracker/demo_feedback"
22
+ HF_TOKEN = os.environ.get("HF_TOKEN")
23
+ TMP_DIR = "tmp"
24
+ FEEDBACK_DIR = "feedback_logs"
25
+ os.makedirs(TMP_DIR, exist_ok=True)
26
+ os.makedirs(FEEDBACK_DIR, exist_ok=True)
27
+
28
+ REFERENCE_PLEDGES = []
29
+
30
+ REFERENCE_PLEDGE_PATH = hf_hub_download(
31
+ repo_id="PledgeTracker/demo_feedback",
32
+ filename="existing_pledges.txt",
33
+ repo_type="dataset",
34
+ token=HF_TOKEN
35
+ )
36
+
37
+ if os.path.exists(REFERENCE_PLEDGE_PATH):
38
+ with open(REFERENCE_PLEDGE_PATH, "r") as f:
39
+ REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
40
+ else:
41
+ print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")
42
+
43
+
44
+ def lemmatize(text):
45
+ doc = nlp(text)
46
+ return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
47
+
48
+
49
+ @app.route("/api/similar-pledges", methods=["POST"])
50
+ def similar_pledges():
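+ # Lemmatise the claim and the reference pledges with spaCy, score them by TF-IDF cosine similarity, and return up to five suggestions above a 0.3 threshold.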
51
+ data = request.get_json()
52
+ claim = data.get("claim", "").strip()
53
+ if not claim or not REFERENCE_PLEDGES:
54
+ return jsonify({"suggestions": []})
55
+
56
+ all_pledges = [claim] + REFERENCE_PLEDGES
57
+ lemmatized_pledges = [lemmatize(p) for p in all_pledges]
58
+
59
+ vectorizer = TfidfVectorizer().fit_transform(lemmatized_pledges)
60
+ similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
61
+ filtered = [(i, similarities[i]) for i in range(len(similarities)) if similarities[i] > 0.3]
62
+ top_filtered = sorted(filtered, key=lambda x: x[1], reverse=True)[:5]
63
+
64
+ suggestions = [
65
+ {"text": REFERENCE_PLEDGES[i], "index": int(i)}
66
+ for i, score in top_filtered
67
+ ]
68
+
69
+ return jsonify({"suggestions": suggestions})
70
+
71
+
72
+ def calculate_time_range(option: str, pledge_date: str = None):
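+ # Resolve a named time-range option into (start, end) strings formatted as YYYYMMDD; the start is clamped so it is never earlier than the pledge date when one is given.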
73
+ today = datetime.today()
74
+
75
+ if option == "week":
76
+ one_week_ago = today - timedelta(days=7)
77
+ start = max(one_week_ago, datetime.strptime(pledge_date, "%Y-%m-%d")) if pledge_date else one_week_ago
78
+ elif option == "month":
79
+ one_month_ago = today - timedelta(days=30)
80
+ start = max(one_month_ago, datetime.strptime(pledge_date, "%Y-%m-%d")) if pledge_date else one_month_ago
81
+ elif option == "year":
82
+ one_year_ago = today - timedelta(days=365)
83
+ start = max(one_year_ago, datetime.strptime(pledge_date, "%Y-%m-%d")) if pledge_date else one_year_ago
84
+ elif option == "since_pledge_date":
85
+ if not pledge_date:
86
+ raise ValueError("Pledge date is required for 'since_pledge_date' option")
87
+ start = datetime.strptime(pledge_date, "%Y-%m-%d")
88
+ else:
89
+ raise ValueError("Invalid time range option")
90
+ print(start)
91
+ return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
92
+
93
+ @app.route("/")
94
+ def serve_html():
95
+ return send_from_directory('.', 'test.html')
96
+
97
+ @app.route("/api/status")
98
+ def check_status():
99
+ user_id = request.args.get("user_id")
100
+ timestamp = request.args.get("timestamp")
101
+ log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
102
+ if not os.path.exists(log_file_path):
103
+ return jsonify({"status": {}}), 200
104
+ try:
105
+ with open(log_file_path, "r") as f:
106
+ status = json.load(f)
107
+ except Exception:
108
+ status = {}
109
+
110
+ return jsonify({"status": status})
111
+
112
+
113
+ @app.route("/api/run-model", methods=["POST"])
114
+ def run_model():
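+ # Run the pledge-tracking pipeline for the submitted claim, write the sorted events to tmp/ as JSON, and upload a feedback log to the HF dataset repo.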
115
+ data = request.get_json()
116
+ claim = data.get("claim", "no input")
117
+ time_range_option = data.get("time_range", "month")
118
+
119
+ suggestion_meta = data.get("suggestion_meta")
120
+ pledge_date = data.get("pledge_date", "")
121
+ pledge_author = data.get("pledge_author", "")
122
+ timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
123
+ user_id = data.get("user_id") or str(uuid.uuid4())[:8]
124
+
125
+ log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
126
+
127
+ status_lock = threading.Lock()
128
+
129
+ def update_status(step_id, msg):
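+ # Append the message to this step's entry in the per-request JSON status log that /api/status polls.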
130
+ print(f"[STATUS] Step {step_id}: {msg}")
131
+ with status_lock:
132
+ if os.path.exists(log_file_path):
133
+ try:
134
+ with open(log_file_path, "r") as f:
135
+ current = json.load(f)
136
+ except Exception:
137
+ current = {}
138
+ else:
139
+ current = {}
140
+ current[str(step_id)] = current.get(str(step_id), "") + f": {msg}"
141
+ with open(log_file_path, "w") as f:
142
+ json.dump(current, f, indent=2)
143
+
144
+ try:
145
+ time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
146
+ print(f"[DEMO] Received claim: {claim}")
147
+ print(f"[DEMO] Time range: {time_start} ~ {time_end}")
148
+ print(f"[DEMO] Time range: {pledge_date}")
149
+
150
+ # user_id = str(uuid.uuid4())[:8]
151
+ # outputs = run_pipeline(claim, pledge_date, pledge_author, time_start, timestamp, user_id)
152
+
153
+
154
+ update_status(0, "📌 Starting the system ...")
155
+ print(suggestion_meta)
156
+
157
+ outputs = run_pipeline(
158
+ claim, pledge_date, pledge_author, time_start, timestamp, user_id,
159
+ update_fn=update_status, suggestion_meta=suggestion_meta
160
+ )
161
+
162
+ df = pd.read_excel(outputs["sorted_events"])
163
+ json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
164
+ df.to_json(json_path, orient="records", indent=2)
165
+
166
+ events = df.to_dict(orient="records")
167
+ log_entry = {
168
+ "requested_time": timestamp,
169
+ "pledge": claim,
170
+ "suggestion_meta": suggestion_meta,
171
+ "user_id": user_id,
172
+ "pledge_author": pledge_author,
173
+ "pledge_date": pledge_date,
174
+ "events": events
175
+ }
176
+ default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
177
+
178
+ with open(default_log_path, "w") as f:
179
+ f.write(json.dumps(log_entry, indent=1))
180
+
181
+ try:
182
+ api = HfApi()
183
+ api.upload_file(
184
+ path_or_fileobj=default_log_path,
185
+ path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
186
+ repo_id=HF_DATASET_REPO,
187
+ repo_type="dataset",
188
+ token=HF_TOKEN
189
+ )
190
+ update_status(7, "✅ done")
191
+
192
+ except Exception as e:
193
+ traceback.print_exc()
194
+ print(f"[Default Feedback Upload Error] {e}")
195
+
196
+ return jsonify({
197
+ "status": "success",
198
+ "file": f"{timestamp}_{user_id}.json",
199
+ "user_id": user_id,
200
+ "timestamp": timestamp
201
+ })
202
+ except Exception as e:
203
+ traceback.print_exc()
204
+ return jsonify({"status": "error", "detail": str(e)}), 500
205
+
206
+ @app.route("/api/events")
207
+ def get_events():
208
+ filename = request.args.get("file")
209
+ file_path = os.path.join(TMP_DIR, filename)
210
+
211
+ if not os.path.exists(file_path):
212
+ return jsonify({"error": "File not found"}), 404
213
+
214
+ with open(file_path, "r") as f:
215
+ events = json.load(f)
216
+
217
+ return jsonify(events)
218
+
219
+
220
+ @app.route("/api/feedback", methods=["POST"])
221
+ def receive_feedback():
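+ # Attach per-event user feedback to the stored events and upload the annotated log to the HF dataset repo.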
222
+ data = request.get_json()
223
+ pledge = data.get("pledge", "no_pledge_text")
224
+ feedback_list = data.get("feedback", [])
225
+ filename = data.get("file")
226
+ file_path = os.path.join(TMP_DIR, filename)
227
+ pledge_date = data.get("pledge_date", "")
228
+ pledge_author = data.get("pledge_author", "")
229
+
230
+ if not os.path.exists(file_path):
231
+ return jsonify({"error": "Event file not found"}), 400
232
+
233
+ with open(file_path, "r") as f:
234
+ events = json.load(f)
235
+
236
+ # Store the feedback answer strings directly, keyed by event index
237
+ feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
238
+
239
+ for idx, event in enumerate(events):
240
+ event["user_feedback"] = feedback_dict.get(idx)
241
+
242
+ log_entry = {
243
+ "requested_time": data.get("timestamp"),
244
+ "user_id": data.get("user_id"),
245
+ "pledge": pledge,
246
+ "pledge_author": pledge_author,
247
+ "pledge_date": pledge_date,
248
+ "events": events
249
+ }
250
+
251
+ timestamp = data.get("timestamp")
252
+ user_id = data.get("user_id")
253
+
254
+ if not user_id or not timestamp:
255
+ return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400
256
+
257
+ local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
258
+
259
+ with open(local_filename, "w") as f:
260
+ f.write(json.dumps(log_entry, indent=1))
261
+
262
+ try:
263
+ api = HfApi()
264
+ api.upload_file(
265
+ path_or_fileobj=local_filename,
266
+ path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
267
+ repo_id=HF_DATASET_REPO,
268
+ repo_type="dataset",
269
+ token=HF_TOKEN
270
+ )
271
+ except Exception as e:
272
+ return jsonify({'status': 'partial_success', 'error': str(e)}), 500
273
+
274
+ return jsonify({'status': 'success'})
275
+
276
+ # @app.route("/api/feedback", methods=["POST"])
277
+ # def receive_feedback():
278
+ # data = request.get_json()
279
+ # pledge = data.get("pledge", "no_pledge_text")
280
+ # feedback_list = data.get("feedback", [])
281
+ # filename = data.get("file")
282
+ # file_path = os.path.join(TMP_DIR, filename)
283
+ # pledge_date = data.get("pledge_date", "")
284
+ # pledge_author = data.get("pledge_author", "")
285
+
286
+ # if not os.path.exists(file_path):
287
+ # return jsonify({"error": "Event file not found"}), 400
288
+
289
+ # with open(file_path, "r") as f:
290
+ # events = json.load(f)
291
+
292
+ # feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
293
+
294
+ # for idx, event in enumerate(events):
295
+ # event["user_feedback"] = feedback_dict.get(idx)
296
+
297
+ # log_entry = {
298
+ # "requested_time": data.get("timestamp"),
299
+ # "user_id": data.get("user_id"),
300
+ # "pledge": pledge,
301
+ # "pledge_author": pledge_author,
302
+ # "pledge_date": pledge_date,
303
+ # "events": events
304
+ # }
305
+
306
+ # timestamp = data.get("timestamp")
307
+ # user_id = data.get("user_id")
308
+ # timestamp = data.get("timestamp")
309
+
310
+ # if not user_id or not timestamp:
311
+ # return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400
312
+
313
+ # local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
314
+
315
+ # with open(local_filename, "w") as f:
316
+ # f.write(json.dumps(log_entry, indent=1))
317
+
318
+ # try:
319
+ # api = HfApi()
320
+ # api.upload_file(
321
+ # path_or_fileobj=local_filename,
322
+ # path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
323
+ # repo_id=HF_DATASET_REPO,
324
+ # repo_type="dataset",
325
+ # token=HF_TOKEN
326
+ # )
327
+ # except Exception as e:
328
+ # return jsonify({'status': 'partial_success', 'error': str(e)}), 500
329
+
330
+ # return jsonify({'status': 'success'})
331
+
332
+ @app.route("/download-feedback/<filename>")
333
+ def download_feedback_file(filename):
334
+ return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)
335
+
336
+ @app.route("/feedback-files")
337
+ def list_feedback_files():
338
+ files = os.listdir(FEEDBACK_DIR)
339
+ return jsonify(sorted(files))
340
+
341
+ @app.route("/download")
342
+ def download_excel():
343
+ file = request.args.get("file")
344
+ if not file:
345
+ return "Missing file param", 400
346
+
347
+ json_path = os.path.join(TMP_DIR, file)
348
+ if not os.path.exists(json_path):
349
+ return "Event file not found", 404
350
+
351
+ with open(json_path, "r") as f:
352
+ data = json.load(f)
353
+
354
+ df = pd.DataFrame(data)
355
+ xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
356
+ df.to_excel(xlsx_path, index=False)
357
+
358
+ return send_file(xlsx_path, as_attachment=True)
359
+
360
+
361
+ if __name__ == '__main__':
362
+ app.run(host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,25 @@
1
+ flask
2
+ flask_cors
3
+ pandas
4
+ openpyxl
5
+ huggingface_hub
6
+ PyMuPDF==1.23.25
7
+ huggingface_hub==0.30.2
8
+ lxml==5.3.1
9
+ nltk==3.9.1
10
+ numpy==2.2.6
11
+ openai==1.84.0
12
+ pandas==2.3.0
13
+ rank_bm25==0.2.2
14
+ Requests==2.32.3
15
+ scikit_learn==1.7.0
16
+ sentence_transformers==3.3.1
17
+ spacy==3.8.2
18
+ tiktoken==0.7.0
19
+ torch==2.6.0
20
+ tqdm
21
+ trafilatura==2.0.0
22
+ transformers==4.51.3
23
+ vllm==0.8.4
24
+ accelerate
25
+
system/.DS_Store ADDED
Binary file (8.2 kB).
system/__init__.py ADDED
File without changes
system/__pycache__/augmented_searching.cpython-312.pyc ADDED
Binary file (4.73 kB).
system/__pycache__/ee.cpython-312.pyc ADDED
Binary file (4.71 kB).
system/__pycache__/generate_output.cpython-312.pyc ADDED
Binary file (3.47 kB).
system/__pycache__/hero_pipeline.cpython-312.pyc ADDED
Binary file (6.22 kB).
system/__pycache__/html2lines.cpython-312.pyc ADDED
Binary file (3.15 kB).
system/__pycache__/initial_searching.cpython-312.pyc ADDED
Binary file (5.25 kB).
system/__pycache__/process_time.cpython-312.pyc ADDED
Binary file (8.95 kB).
system/__pycache__/scraper.cpython-312.pyc ADDED
Binary file (4.76 kB).
system/augmented_searching.py ADDED
@@ -0,0 +1,101 @@
1
+ import json
2
+ import os
3
+ import time
4
+ import requests
5
+ import pandas as pd
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ import spacy
9
+
10
+ def google_search(query, api_key, search_engine_id, start_date, end_date):
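+ # Query the Google Custom Search JSON API, restricted to the given date range and UK results; returns the result items (empty list on failure).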
11
+ print(f"[SYSTEM] Calling Google Search API for: {query}")
12
+ sort = f"date:r:{start_date}:{end_date}"
13
+ url = "https://www.googleapis.com/customsearch/v1"
14
+ params = {
15
+ "q": query,
16
+ "key": api_key,
17
+ "cx": search_engine_id,
18
+ "num": 10,
19
+ "sort": sort,
20
+ "cr": "countryUK",
21
+ "gl": "uk"
22
+ }
23
+ try:
24
+ response = requests.get(url, params=params)
25
+ response.raise_for_status()
26
+ return response.json().get("items", [])
27
+ except Exception as e:
28
+ print(f"[ERROR] Google Search Failed: {e}")
29
+ return []
30
+
31
+ def save_tsv(file_name, id_value, string_value, value_list, query):
32
+
33
+ data = {
34
+ 'ID': id_value,
35
+ 'String': string_value,
36
+ 'ListValue': value_list,
37
+ 'query': query
38
+ }
39
+ df = pd.DataFrame(data)
40
+ df.to_csv(file_name, sep='\t', index=False, header=False)
41
+
42
+ def ensure_directory_exists(path):
43
+ dir_path = Path(path).expanduser().resolve().parent
44
+ if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
45
+ raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
46
+ dir_path.mkdir(parents=True, exist_ok=True)
47
+
48
+ def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, start_date, end_date, user_id, claim_id):
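+ # Build search queries from the claim and its generated questions, collect result URLs (skipping the Full Fact government tracker), and save them to a TSV.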
49
+ if suggestion_meta is None:
50
+ qa_lines = open(f"{qa_file}","r").read()
51
+ qa_lines = json.loads(qa_lines)
52
+ claim_text = f"{pledge_author}: {qa_lines['claim']}"
53
+ else:
54
+ # claim_text = suggestion_meta["text"]
55
+ idx = suggestion_meta["index"]
56
+ qa_lines = open(f"{qa_file}","r").readlines()[idx]
57
+ qa_lines = json.loads(qa_lines)
58
+ claim_text = f"{qa_lines['claim']}"
59
+ print(qa_lines)
60
+
61
+
62
+ api_key = os.environ.get("GOOGLE_API_KEY")
63
+ search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
64
+ if not api_key or not search_engine_id:
65
+ raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
66
+
67
+ # base_dir = pipeline_base_dir
68
+
69
+ tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
70
+ ensure_directory_exists(tsv_file_path)
71
+
72
+
73
+ urls = []
74
+ string_values = []
75
+ queries = []
76
+ questions = []
77
+ questions = list(dict.fromkeys(evidence["question"] for evidence in qa_lines["evidence"]))  # de-duplicate while preserving order
78
+ questions = questions[:10]
79
+
80
+
81
+ results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
82
+ print(results)
83
+ for result in results:
84
+ if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
85
+ string_values.append("claim")
86
+ urls.append(result["link"])
87
+ queries.append(f"{pledge_author}: {claim_text}")
88
+
89
+ for question in questions:
90
+ results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
91
+ for result in results:
92
+ if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
93
+ string_values.append("question")
94
+ urls.append(result["link"])
95
+ queries.append(f"{question}")
96
+
97
+ urls = list(dict.fromkeys(urls))
98
+
99
+ save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
100
+ print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
101
+ return str(tsv_file_path)
system/baseline/hyde_fc_generation_optimized.py ADDED
@@ -0,0 +1,163 @@
1
+ from vllm import LLM, SamplingParams
2
+ import json
3
+ import torch
4
+ import time
5
+ from datetime import datetime, timedelta
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from typing import List, Dict, Any
9
+ import concurrent.futures
10
+
11
+ class VLLMGenerator:
12
+ def __init__(self, model_name: str, n: int = 8, max_tokens: int = 512,
13
+ temperature: float = 0.7, top_p: float = 1.0,
14
+ frequency_penalty: float = 0.0, presence_penalty: float = 0.0,
15
+ stop: List[str] = ['\n\n\n'], batch_size: int = 32):
16
+ self.device_count = torch.cuda.device_count()
17
+ print(f"Initializing with {self.device_count} GPUs")
18
+ self.llm = LLM(
19
+ model=model_name,
20
+ tensor_parallel_size=self.device_count,
21
+ max_model_len=4096,
22
+ gpu_memory_utilization=0.95,
23
+ enforce_eager=True,
24
+ trust_remote_code=True,
25
+ # quantization="bitsandbytes",
26
+ # dtype="half",
27
+ # load_format="bitsandbytes",
28
+ max_num_batched_tokens=4096,
29
+ max_num_seqs=batch_size
30
+ )
31
+ self.sampling_params = SamplingParams(
32
+ n=n,
33
+ max_tokens=max_tokens,
34
+ temperature=temperature,
35
+ top_p=top_p,
36
+ frequency_penalty=frequency_penalty,
37
+ presence_penalty=presence_penalty,
38
+ stop=stop,
39
+ logprobs=1
40
+ )
41
+ self.batch_size = batch_size
42
+ self.tokenizer = self.llm.get_tokenizer()
43
+ print(f"Initialization complete. Batch size: {batch_size}")
44
+
45
+ def parse_response(self, responses):
46
+ all_outputs = []
47
+ for response in responses:
48
+ to_return = []
49
+ for output in response.outputs:
50
+ text = output.text.strip()
51
+ try:
52
+ logprob = sum(logprob_obj.logprob for item in output.logprobs for logprob_obj in item.values())
53
+ except Exception:
54
+ logprob = 0 # Fallback if logprobs aren't available
55
+ to_return.append((text, logprob))
56
+ texts = [r[0] for r in sorted(to_return, key=lambda tup: tup[1], reverse=True)]
57
+ all_outputs.append(texts)
58
+ return all_outputs
59
+
60
+ def prepare_prompt(self, claim: str, model_name: str) -> str:
61
+ base_prompt = f"Please write a fact-checking article passage to support, refute, indicate not enough evidence, or present conflicting evidence regarding the claim.\nClaim: {claim}"
62
+
63
+ if "OLMo" in model_name:
64
+ return base_prompt
65
+ else:
66
+ messages = [{"role": "user", "content": base_prompt}]
67
+ return self.tokenizer.apply_chat_template(messages, tokenize=False) + "<|start_header_id|>assistant<|end_header_id|>\n\nPassage: "
68
+
69
+ def process_batch(self, batch: List[Dict[str, Any]], model_name: str) -> tuple[List[Dict[str, Any]], float]:
70
+ start_time = time.time()
71
+ prompts = [self.prepare_prompt(example["claim"], model_name) for example in batch]
72
+
73
+ try:
74
+ results = self.llm.generate(prompts, sampling_params=self.sampling_params)
75
+ outputs = self.parse_response(results)
76
+
77
+ for example, output in zip(batch, outputs):
78
+ example['hypo_fc_docs'] = output
79
+
80
+ batch_time = time.time() - start_time
81
+ return batch, batch_time
82
+ except Exception as e:
83
+ print(f"Error processing batch: {str(e)}")
84
+ return batch, time.time() - start_time
85
+
86
+ # def format_time(seconds: float) -> str:
87
+ # return str(timedelta(seconds=int(seconds)))
88
+
89
+ # def estimate_completion_time(start_time: float, processed_examples: int, total_examples: int) -> str:
90
+ # elapsed_time = time.time() - start_time
91
+ # examples_per_second = processed_examples / elapsed_time
92
+ # remaining_examples = total_examples - processed_examples
93
+ # estimated_remaining_seconds = remaining_examples / examples_per_second
94
+ # completion_time = datetime.now() + timedelta(seconds=int(estimated_remaining_seconds))
95
+ # return completion_time.strftime("%Y-%m-%d %H:%M:%S")
96
+
97
+ def main(args):
98
+ total_start_time = time.time()
99
+ print(f"Script started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
100
+
101
+ # Load data
102
+ print("Loading data...")
103
+ with open(args.target_data, 'r', encoding='utf-8') as json_file:
104
+ examples = json.load(json_file)
105
+ print(f"Loaded {len(examples)} examples")
106
+
107
+ # Initialize generator
108
+ print("Initializing generator...")
109
+ generator = VLLMGenerator(
110
+ model_name=args.model,
111
+ batch_size=32
112
+ )
113
+
114
+ # Process data in batches
115
+ processed_data = []
116
+ # batch_times = []
117
+ batches = [examples[i:i + generator.batch_size] for i in range(0, len(examples), generator.batch_size)]
118
+
119
+ print(f"\nProcessing {len(batches)} batches...")
120
+ with tqdm(total=len(examples), desc="Processing examples") as pbar:
121
+ for batch_idx, batch in enumerate(batches, 1):
122
+ processed_batch, batch_time = generator.process_batch(batch, args.model)
123
+ processed_data.extend(processed_batch)
124
+ # batch_times.append(batch_time)
125
+
126
+ # Update progress and timing information
127
+ # examples_processed = len(processed_data)
128
+ # avg_batch_time = sum(batch_times) / len(batch_times)
129
+ # estimated_completion = estimate_completion_time(total_start_time, examples_processed, len(examples))
130
+
131
+ # pbar.set_postfix({
132
+ # 'Batch': f"{batch_idx}/{len(batches)}",
133
+ # 'Avg Batch Time': f"{avg_batch_time:.2f}s",
134
+ # 'ETA': estimated_completion
135
+ # })
136
+ # pbar.update(len(batch))
137
+
138
+ # Calculate and display timing statistics
139
+ # total_time = time.time() - total_start_time
140
+ # avg_batch_time = sum(batch_times) / len(batch_times)
141
+ # avg_example_time = total_time / len(examples)
142
+
143
+ # print("\nTiming Statistics:")
144
+ # print(f"Total Runtime: {format_time(total_time)}")
145
+ # print(f"Average Batch Time: {avg_batch_time:.2f} seconds")
146
+ # print(f"Average Time per Example: {avg_example_time:.2f} seconds")
147
+ # print(f"Throughput: {len(examples)/total_time:.2f} examples/second")
148
+
149
+ # Save results
150
+ # print("\nSaving results...")
151
+ with open(args.json_output, "w", encoding="utf-8") as output_json:
152
+ json.dump(processed_data, output_json, ensure_ascii=False, indent=4)
153
+
154
+ # print(f"Script completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
155
+ # print(f"Total runtime: {format_time(total_time)}")
156
+
157
+ if __name__ == "__main__":
158
+ parser = argparse.ArgumentParser()
159
+ parser.add_argument('-i', '--target_data', default='data_store/averitec/dev.json')
160
+ parser.add_argument('-o', '--json_output', default='data_store/hyde_fc.json')
161
+ parser.add_argument('-m', '--model', default="meta-llama/Llama-3.1-8B-Instruct")
162
+ args = parser.parse_args()
163
+ main(args)
system/baseline/question_generation_optimized.py ADDED
@@ -0,0 +1,244 @@
1
+ import os
2
+ import argparse
3
+ import time
4
+ import json
5
+ import nltk
6
+ from rank_bm25 import BM25Okapi
7
+ import numpy as np
8
+ import torch
9
+ from vllm import LLM, SamplingParams
10
+ from datetime import datetime, timedelta
11
+ from itertools import islice
12
+
13
+
14
+ def download_nltk_data(package_name, download_dir='nltk_data'):
15
+ # Ensure the download directory exists
16
+ os.makedirs(download_dir, exist_ok=True)
17
+
18
+ # Set NLTK data path
19
+ nltk.data.path.append(download_dir)
20
+
21
+ try:
22
+ # Try to find the resource
23
+ nltk.data.find(f'tokenizers/{package_name}')
24
+ print(f"Package '{package_name}' is already downloaded")
25
+ except LookupError:
26
+ # If resource isn't found, download it
27
+ print(f"Downloading {package_name}...")
28
+ nltk.download(package_name, download_dir=download_dir)
29
+ print(f"Successfully downloaded {package_name}")
30
+
31
+ # def format_time(seconds):
32
+ # """Format time duration nicely."""
33
+ # return str(timedelta(seconds=round(seconds)))
34
+
35
+ def claim2prompts(example):
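+ # Yield (evidence lookup string, few-shot prompt) pairs built from each question-answer pair of a training example.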
36
+ claim = example["claim"]
37
+ claim_str = "Example [NUMBER]:||Claim: " + claim + "||Evidence: "
38
+
39
+ for question in example["questions"]:
40
+ q_text = question["question"].strip()
41
+ if len(q_text) == 0:
42
+ continue
43
+
44
+ if not q_text[-1] == "?":
45
+ q_text += "?"
46
+
47
+ answer_strings = []
48
+
49
+ for a in question["answers"]:
50
+ if a["answer_type"] in ["Extractive", "Abstractive"]:
51
+ answer_strings.append(a["answer"])
52
+ if a["answer_type"] == "Boolean":
53
+ answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())
54
+
55
+ for a_text in answer_strings:
56
+ if not a_text[-1] in [".", "!", ":", "?"]:
57
+ a_text += "."
58
+
59
+ prompt_lookup_str = a_text
60
+ this_q_claim_str = claim_str + a_text.strip() + "||Question: " + q_text
61
+ yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n")[:1500])
62
+
63
+ def main(args):
64
+ # script_start = time.time()
65
+ # start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
66
+ # print(f"Script started at: {start_time}")
67
+ # print(f"Loading model: {args.model}")
68
+
69
+
70
+ download_nltk_data('punkt')
71
+ download_nltk_data('punkt_tab')
72
+
73
+ # Load and prepare reference corpus
74
+ # corpus_start = time.time()
75
+ with open(args.reference_corpus, "r", encoding="utf-8") as json_file:
76
+ train_examples = json.load(json_file)
77
+
78
+ prompt_corpus, tokenized_corpus = [], []
79
+ for example in train_examples:
80
+ for lookup_str, prompt in claim2prompts(example):
81
+ entry = nltk.word_tokenize(lookup_str)
82
+ tokenized_corpus.append(entry)
83
+ prompt_corpus.append(prompt)
84
+
85
+ prompt_bm25 = BM25Okapi(tokenized_corpus)
86
+ # print(f"Reference corpus processed in: {format_time(time.time() - corpus_start)}")
87
+
88
+ # Initialize vLLM with optimized settings
89
+ gpu_count = torch.cuda.device_count()
90
+ print(f"Using {gpu_count} GPU{'s' if gpu_count > 1 else ''}")
91
+
92
+ # model_start = time.time()
93
+ llm = LLM(
94
+ model=args.model,
95
+ tensor_parallel_size=gpu_count,
96
+ max_model_len=4096,
97
+ gpu_memory_utilization=0.95,
98
+ enforce_eager=True,
99
+ trust_remote_code=True,
100
+ # dtype="half",
101
+ )
102
+ llm.get_tokenizer().pad_token = "<|end_of_text|>"
103
+ # print(f"Model loaded in: {format_time(time.time() - model_start)}")
104
+
105
+ sampling_params = SamplingParams(
106
+ temperature=0.6,
107
+ top_p=0.9,
108
+ top_k=1,
109
+ skip_special_tokens=False,
110
+ max_tokens=512,
111
+ stop=['<|end_of_text|>', '</s>', '<|im_end|>', '[INST]', '[/INST]','<|eot_id|>','<|end|>','<|endoftext|>']
112
+ )
113
+
114
+ # processing_start = time.time()
115
+
116
+ # Load target data
117
+ target_examples = []
118
+ with open(args.top_k_target_knowledge, "r", encoding="utf-8") as json_file:
119
+ for line in json_file:
120
+ target_examples.append(json.loads(line))
121
+
122
+ if args.end == -1:
123
+ args.end = len(target_examples)
124
+ print(f"Processing {args.end} examples")
125
+
126
+ # Process in batches
127
+ with torch.no_grad():
128
+ with open(args.output_questions, "w", encoding="utf-8") as output_file:
129
+ for idx in range(0, args.end, args.batch_size):
130
+ batch_end = min(idx + args.batch_size, args.end)
131
+ current_batch = target_examples[idx:batch_end]
132
+ print(f"\nProcessing batch {idx}-{batch_end}...")
133
+
134
+ for example in current_batch:
135
+ # batch_start = time.time()
136
+ claim = example["claim"]
137
+ claim_id = example["claim_id"]
138
+ top_k_sentences_urls = example[f"top_{args.top_k}"]
139
+
140
+ batch_prompts = []
141
+ batch_metadata = []
142
+
143
+ # Prepare all prompts for current example
144
+ for sentences_urls in top_k_sentences_urls:
145
+ prompt_lookup_str = sentences_urls["sentence"]
146
+ url = sentences_urls["url"]
147
+
148
+ prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
149
+ prompt_n = 10
150
+ prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
151
+ prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
152
+
153
+ temp_prompt = "\n\n".join(prompt_docs)
154
+ for k in range(1, temp_prompt.count("[NUMBER]")+1):
155
+ temp_prompt = temp_prompt.replace("[NUMBER]", f"{k}", 1)
156
+
157
+ claim_prompt = "Your task is to generate a question based on the given claim and evidence. The question should clarify the relationship between the evidence and the claim\n\n"
158
+ evidence = prompt_lookup_str.replace("\n", " ")
159
+ full_prompt = claim_prompt + temp_prompt + "\n\nNow, generate a question that links the following claim and evidence:" + f"\n\nClaim: {claim}" + f"\nEvidence: {evidence}"
160
+
161
+ if "OLMo" in args.model:
162
+ inputs = full_prompt
163
+ else:
164
+ messages = [{"role":"user", "content":full_prompt}]
165
+ inputs = llm.get_tokenizer().apply_chat_template(messages, tokenize=False)
166
+ inputs += "<|start_header_id|>assistant<|end_header_id|>\n\nQuestion: "
167
+
168
+ batch_prompts.append(inputs)
169
+ batch_metadata.append((url, prompt_lookup_str))
170
+
171
+ # Process batch
172
+ outputs = llm.generate(batch_prompts, sampling_params)
173
+
174
+ # Process outputs
175
+ evidence = []
176
+ for output, (url, sent) in zip(outputs, batch_metadata):
177
+ question = output.outputs[0].text.strip().split("?")[0].replace("\n", " ") + "?"
178
+ evidence.append({
179
+ "question": question,
180
+ "answer": sent,
181
+ "url": url
182
+ })
183
+
184
+ # Write results
185
+ json_data = {
186
+ "claim_id": claim_id,
187
+ "claim": claim,
188
+ "evidence": evidence
189
+ }
190
+ output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
191
+ output_file.flush()
192
+
193
+ # batch_time = time.time() - batch_start
194
+ # print(f"Processed example {claim_id}. Time elapsed: {batch_time:.2f}s")
195
+
196
+ # Calculate and display timing information
197
+ # total_time = time.time() - script_start
198
+ # processing_time = time.time() - processing_start
199
+ # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
200
+
201
+ # print("\nTiming Summary:")
202
+ # print(f"Start time: {start_time}")
203
+ # print(f"End time: {end_time}")
204
+ # print(f"Total runtime: {format_time(total_time)}")
205
+ # print(f"Setup time: {format_time(processing_start - script_start)}")
206
+ # print(f"Processing time: {format_time(processing_time)}")
207
+ # print(f"Results written to: {args.output_questions}")
208
+
209
+ if __name__ == "__main__":
210
+ parser = argparse.ArgumentParser(description="Use a prompt to generate questions that could be answered by top-k retrieved evidence. Output generated questions.")
211
+ parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
212
+ parser.add_argument("--reference_corpus", default="baseline/train.json")
213
+ parser.add_argument(
214
+ "-i",
215
+ "--top_k_target_knowledge",
216
+ default="data_store/dev_reranking_top_k.json",
217
+ help="Directory where the sentences for the scraped data is saved.",
218
+ )
219
+ parser.add_argument(
220
+ "-o",
221
+ "--output_questions",
222
+ default="data_store/dev_top_k_qa.json",
223
+ help="Directory where the sentences for the scraped data is saved.",
224
+ )
225
+ parser.add_argument(
226
+ "--top_k",
227
+ default=10,
228
+ type=int
229
+ )
230
+ parser.add_argument(
231
+ "--batch_size",
232
+ type=int,
233
+ default=4,
234
+ help="Number of examples to process in each batch"
235
+ )
236
+ parser.add_argument(
237
+ "-e",
238
+ "--end",
239
+ type=int,
240
+ default=-1
241
+ )
242
+
243
+ args = parser.parse_args()
244
+ main(args)
system/baseline/reranking_optimized.py ADDED
@@ -0,0 +1,230 @@
1
+ import os
2
+ import torch
3
+ import gc
4
+ from transformers import AutoModel, AutoTokenizer
5
+ from sentence_transformers import SentenceTransformer
6
+ import numpy as np
7
+ import json
8
+ import argparse
9
+ import time
10
+ from datetime import datetime, timedelta
11
+ import re
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
+ def encode_text(model, tokenizer, texts, batch_size=8, max_length=512):
16
+ """Encode texts to embeddings using AutoModel"""
17
+ all_embeddings = []
18
+
19
+ for i in range(0, len(texts), batch_size):
20
+ batch = texts[i:i + batch_size]
21
+
22
+ # Tokenize
23
+ encoded_input = tokenizer(
24
+ batch,
25
+ padding=True,
26
+ truncation=True,
27
+ max_length=max_length,
28
+ return_tensors='pt'
29
+ ).to(model.device)
30
+
31
+ # Compute token embeddings
32
+ with torch.no_grad():
33
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
34
+ model_output = model(**encoded_input)
35
+ # Use mean pooling
36
+ attention_mask = encoded_input['attention_mask']
37
+ token_embeddings = model_output[0] # First element contains token embeddings
38
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
39
+ embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
40
+ all_embeddings.append(embeddings.cpu().numpy())
41
+
42
+ # Clear some memory
43
+ if i % (batch_size * 4) == 0:
44
+ torch.cuda.empty_cache()
45
+ gc.collect()
46
+
47
+ return np.vstack(all_embeddings)
48
+
49
+ def compute_similarity(emb1, emb2):
50
+ """Compute cosine similarity between embeddings"""
51
+ return np.dot(emb1, emb2.T) / (
52
+ np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
53
+ np.linalg.norm(emb2, axis=1).reshape(1, -1)
54
+ )
55
+
56
+ def get_detailed_instruct(task_description: str, query: str) -> str:
57
+ return f'Instruct: {task_description}\nQuery: {query}'
58
+
59
+ def preprocess_sentences(sentence1, sentence2):
60
+ vectorizer = TfidfVectorizer().fit_transform([sentence1, sentence2])
61
+ vectors = vectorizer.toarray()
62
+
63
+ cosine_sim = cosine_similarity(vectors)
64
+ similarity_score = cosine_sim[0][1]
65
+ return similarity_score
66
+
67
+ def remove_trailing_special_chars(text):
68
+ return re.sub(r'[\W_]+$', '', text)
69
+
70
+ def remove_special_chars_except_spaces(text):
71
+ return re.sub(r'[^\w\s]+', '', text)
72
+
73
+ def select_top_k(claim, results, top_k):
74
+ '''
75
+ Select up to top_k sentences, skipping near-duplicates of the claim itself.
76
+ '''
77
+ dup_check = set()
78
+ top_k_sentences_urls = []
79
+
80
+ i = 0
81
+ print(results)
82
+ claim = remove_special_chars_except_spaces(claim).lower()
83
+ while len(top_k_sentences_urls) < top_k and i < len(results):
84
+ print(i)
85
+ sentence = remove_special_chars_except_spaces(results[i]['sentence']).lower()
86
+
87
+ if sentence not in dup_check:
88
+ if preprocess_sentences(claim, sentence) > 0.97:
89
+ dup_check.add(sentence)
90
+ continue
91
+
92
+ if claim in sentence:
93
+ if len(claim) / len(sentence) > 0.92:
94
+ dup_check.add(sentence)
95
+ continue
96
+
97
+ top_k_sentences_urls.append({
98
+ 'sentence': results[i]['sentence'],
99
+ 'url': results[i]['url']}
100
+ )
101
+ i += 1
102
+
103
+ return top_k_sentences_urls
104
+
105
+ # def format_time(seconds):
106
+ # """Format time duration nicely."""
107
+ # return str(timedelta(seconds=round(seconds)))
108
+
109
+
110
+ def compute_embeddings_batched(model, texts, batch_size=8):
111
+ """Compute embeddings in smaller batches to manage memory"""
112
+ all_embeddings = []
113
+ for i in range(0, len(texts), batch_size):
114
+ batch = texts[i:i + batch_size]
115
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16): # Use bfloat16
116
+ emb = model.encode(batch, batch_size=len(batch), show_progress_bar=False)
117
+ all_embeddings.append(emb)
118
+
119
+ # Clear some memory
120
+ if i % (batch_size * 4) == 0:
121
+ torch.cuda.empty_cache()
122
+ gc.collect()
123
+
124
+ return np.vstack(all_embeddings)
125
+
126
+ def main(args):
127
+
128
+
129
+ device = "cuda" if torch.cuda.is_available() else 'cpu'
130
+ print(f"Using device: {device}")
131
+
132
+ # Load model and tokenizer
133
+ model = AutoModel.from_pretrained(
134
+ "Salesforce/SFR-Embedding-2_R",
135
+ torch_dtype=torch.bfloat16,
136
+ low_cpu_mem_usage=True,
137
+ device_map="auto"
138
+ )
139
+ tokenizer = AutoTokenizer.from_pretrained("Salesforce/SFR-Embedding-2_R")
140
+
141
+ # Load target examples
142
+ target_examples = []
143
+ with open(args.target_data, "r", encoding="utf-8") as json_file:
144
+ for i, line in enumerate(json_file):
145
+ try:
146
+ example = json.loads(r"{}".format(line))
147
+ target_examples.append(example)
148
+ except:
149
+ print(f"CURRENT LINE broken {i}")
150
+
151
+ if args.end == -1:
152
+ args.end = len(target_examples)
153
+
154
+ files_to_process = list(range(args.start, args.end))
155
+ total = len(files_to_process)
156
+
157
+ task = 'Given a web search query, retrieve relevant passages that answer the query'
158
+
159
+ with open(args.json_output, "w", encoding="utf-8") as output_json:
160
+ done = 0
161
+ for idx, example in enumerate(target_examples):
162
+ if idx in files_to_process:
163
+ print(f"Processing claim {example['claim_id']}... Progress: {done + 1} / {total}")
164
+
165
+ claim = example['claim']
166
+ query = [get_detailed_instruct(task, claim)] + [
167
+ get_detailed_instruct(task, le)
168
+ for le in example['hypo_fc_docs']
169
+ if len(le.strip()) > 0
170
+ ]
171
+ query_length = len(query)
172
+ sentences = [sent['sentence'] for sent in example[f'top_{5000}']][:args.retrieved_top_k]
173
+
174
+ # st = time.time()
175
+ try:
176
+ # Process query embeddings
177
+ query_embeddings = encode_text(model, tokenizer, query, batch_size=4)
178
+ avg_emb_q = np.mean(query_embeddings, axis=0)
179
+ hyde_vector = avg_emb_q.reshape((1, -1))
180
+
181
+ # Process sentence embeddings in smaller chunks
182
+ sentence_embeddings = encode_text(
183
+ model,
184
+ tokenizer,
185
+ sentences,
186
+ batch_size=args.batch_size
187
+ )
188
+
189
+ # Compute similarities in chunks to save memory
190
+ chunk_size = 1000
191
+ all_scores = []
192
+ for i in range(0, len(sentence_embeddings), chunk_size):
193
+ chunk = sentence_embeddings[i:i + chunk_size]
194
+ chunk_scores = compute_similarity(hyde_vector, chunk)[0]
195
+ all_scores.extend(chunk_scores)
196
+
197
+ scores = np.array(all_scores)
198
+ top_k_idx = np.argsort(scores)[::-1]
199
+ results = [example['top_5000'][i] for i in top_k_idx]
200
+ top_k_sentences_urls = select_top_k(claim, results, args.top_k)
201
+
202
+ # print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st:.2f}s")
203
+
204
+ json_data = {
205
+ "claim_id": example['claim_id'],
206
+ "claim": claim,
207
+ f"top_{args.top_k}": top_k_sentences_urls
208
+ }
209
+ output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
210
+ output_json.flush()
211
+
212
+ except RuntimeError as e:
213
+ print(f"Error processing claim {example['claim_id']}: {e}")
214
+ continue
215
+
216
+ done += 1
217
+
218
+
219
+ if __name__ == "__main__":
220
+ parser = argparse.ArgumentParser()
221
+ parser.add_argument("--target_data", default="data_store/dev_retrieval_top_k.json")
222
+ parser.add_argument("--retrieved_top_k", type=int, default=5000)
223
+ parser.add_argument("--top_k", type=int, default=10)
224
+ parser.add_argument("-o", "--json_output", type=str, default="data_store/dev_reranking_top_k.json")
225
+ parser.add_argument("--batch_size", type=int, default=32)
226
+ parser.add_argument("-s", "--start", type=int, default=0)
227
+ parser.add_argument("-e", "--end", type=int, default=-1)
228
+ args = parser.parse_args()
229
+
230
+ main(args)
system/baseline/retrieval_optimized.py ADDED
@@ -0,0 +1,244 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import pandas as pd
7
+ import nltk
8
+ from rank_bm25 import BM25Okapi
9
+ from multiprocessing import Pool, cpu_count, Manager, Lock
10
+ from functools import partial
11
+ import heapq
12
+ from threading import Thread, Event
13
+ import queue
14
+ from datetime import datetime, timedelta
15
+
16
+
17
+ def download_nltk_data(package_name, download_dir='nltk_data'):
18
+ # Ensure the download directory exists
19
+ os.makedirs(download_dir, exist_ok=True)
20
+
21
+ # Set NLTK data path
22
+ nltk.data.path.append(download_dir)
23
+
24
+ try:
25
+ # Try to find the resource
26
+ nltk.data.find(f'tokenizers/{package_name}')
27
+ print(f"Package '{package_name}' is already downloaded")
28
+ except LookupError:
29
+ # If resource isn't found, download it
30
+ print(f"Downloading {package_name}...")
31
+ nltk.download(package_name, download_dir=download_dir)
32
+ print(f"Successfully downloaded {package_name}")
33
+
34
+
35
+ def combine_all_sentences(knowledge_file):
36
+ sentences, urls = [], []
37
+
38
+ with open(knowledge_file, "r", encoding="utf-8") as json_file:
39
+ for i, line in enumerate(json_file):
40
+ data = json.loads(line)
41
+ sentences.extend(data["url2text"])
42
+ urls.extend([data["url"] for _ in range(len(data["url2text"]))])
43
+ return sentences, urls, i + 1
44
+
45
+ def remove_duplicates(sentences, urls):
46
+ df = pd.DataFrame({"document_in_sentences":sentences, "sentence_urls":urls})
47
+ df['sentences'] = df['document_in_sentences'].str.strip().str.lower()
48
+ df = df.drop_duplicates(subset="sentences").reset_index()
49
+ return df['document_in_sentences'].tolist(), df['sentence_urls'].tolist()
50
+
51
+ def retrieve_top_k_sentences(query, document, urls, top_k):
52
+ tokenized_docs = [nltk.word_tokenize(doc) for doc in document[:top_k]]
53
+ bm25 = BM25Okapi(tokenized_docs)
54
+
55
+ scores = bm25.get_scores(nltk.word_tokenize(query))
56
+ top_k_idx = np.argsort(scores)[::-1][:top_k]
57
+
58
+ return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]
59
+
60
+ def process_single_example(idx, example, args, result_queue, counter, lock):
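+ # Load the scraped sentences for one claim, de-duplicate them, retrieve the BM25 top-k with their URLs, and push the result onto the shared queue.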
61
+ try:
62
+ with lock:
63
+ current_count = counter.value + 1
64
+ counter.value = current_count
65
+ print(f"\nProcessing claim {idx}... Progress: {current_count} / {args.total_examples}")
66
+
67
+ # start_time = time.time()
68
+
69
+ document_in_sentences, sentence_urls, num_urls_this_claim = combine_all_sentences(
70
+ os.path.join(args.knowledge_store_dir, f"{idx}.jsonl")
71
+ )
72
+
73
+ print(f"Obtained {len(document_in_sentences)} sentences from {num_urls_this_claim} urls.")
74
+
75
+ document_in_sentences, sentence_urls = remove_duplicates(document_in_sentences, sentence_urls)
76
+
77
+ query = example["claim"] + " " + " ".join(example['hypo_fc_docs'])
78
+ top_k_sentences, top_k_urls = retrieve_top_k_sentences(
79
+ query, document_in_sentences, sentence_urls, args.top_k
80
+ )
81
+
82
+
83
+ result = {
84
+ "claim_id": idx,
85
+ "claim": example["claim"],
86
+ f"top_{args.top_k}": [
87
+ {"sentence": sent, "url": url}
88
+ for sent, url in zip(top_k_sentences, top_k_urls)
89
+ ],
90
+ "hypo_fc_docs": example['hypo_fc_docs']
91
+ }
92
+
93
+ result_queue.put((idx, result))
94
+ return True
95
+ except Exception as e:
96
+ print(f"Error processing example {idx}: {str(e)}")
97
+ result_queue.put((idx, None))
98
+ return False
99
+
100
+ def writer_thread(output_file, result_queue, total_examples, stop_event):
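+ # Drain the result queue and write results to the output file in claim-index order, buffering out-of-order results in a heap.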
101
+ next_index = 0
102
+ pending_results = []
103
+
104
+ with open(output_file, "w", encoding="utf-8") as f:
105
+ while not (stop_event.is_set() and result_queue.empty()):
106
+ try:
107
+ idx, result = result_queue.get(timeout=1)
108
+
109
+ if result is not None:
110
+ heapq.heappush(pending_results, (idx, result))
111
+
112
+ while pending_results and pending_results[0][0] == next_index:
113
+ _, result_to_write = heapq.heappop(pending_results)
114
+ f.write(json.dumps(result_to_write, ensure_ascii=False) + "\n")
115
+ f.flush()
116
+ next_index += 1
117
+
118
+ except queue.Empty:
119
+ continue
120
+
121
+ # def format_time(seconds):
122
+ # """Format time duration nicely."""
123
+ # return str(timedelta(seconds=round(seconds)))
124
+
125
+ def main(args):
126
+
127
+
128
+
129
+ download_nltk_data('punkt')
130
+ download_nltk_data('punkt_tab')
131
+
132
+ with open(args.target_data, "r", encoding="utf-8") as json_file:
133
+ target_examples = json.load(json_file)
134
+
135
+ if args.end == -1:
136
+ args.end = len(target_examples)
137
+
138
+ print(f"Total examples to process: {args.end - args.start}")
139
+
140
+ files_to_process = list(range(args.start, args.end))
141
+ examples_to_process = [(idx, target_examples[idx]) for idx in files_to_process]
142
+
143
+ num_workers = min(args.workers if args.workers > 0 else cpu_count(), len(files_to_process))
144
+ print(f"Using {num_workers} workers to process {len(files_to_process)} examples")
145
+
146
+ with Manager() as manager:
147
+ counter = manager.Value('i', 0)
148
+ lock = manager.Lock()
149
+ args.total_examples = len(files_to_process)
150
+
151
+ result_queue = manager.Queue()
152
+
153
+ stop_event = Event()
154
+ writer = Thread(
155
+ target=writer_thread,
156
+ args=(args.json_output, result_queue, len(files_to_process), stop_event)
157
+ )
158
+ writer.start()
159
+
160
+ process_func = partial(
161
+ process_single_example,
162
+ args=args,
163
+ result_queue=result_queue,
164
+ counter=counter,
165
+ lock=lock
166
+ )
167
+
168
+ with Pool(num_workers) as pool:
169
+ results = pool.starmap(process_func, examples_to_process)
170
+
171
+ stop_event.set()
172
+ writer.join()
173
+
174
+ # successful = sum(1 for r in results if r)
175
+ # print(f"\nSuccessfully processed {successful} out of {len(files_to_process)} examples")
176
+ # print(f"Results written to {args.json_output}")
177
+
178
+ # # Calculate and display timing information
179
+ # total_time = time.time() - script_start
180
+ # avg_time = total_time / len(files_to_process)
181
+ # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
182
+
183
+ # print("\nTiming Summary:")
184
+ # print(f"Start time: {start_time}")
185
+ # print(f"End time: {end_time}")
186
+ # print(f"Total runtime: {format_time(total_time)} (HH:MM:SS)")
187
+ # print(f"Average time per example: {avg_time:.2f} seconds")
188
+ # if successful > 0:
189
+ # print(f"Processing speed: {successful / total_time:.2f} examples per second")
190
+
191
+ if __name__ == "__main__":
192
+ parser = argparse.ArgumentParser(
193
+ description="Get top 10000 sentences with BM25 in the knowledge store using parallel processing."
194
+ )
195
+ parser.add_argument(
196
+ "-k",
197
+ "--knowledge_store_dir",
198
+ type=str,
199
+ default="data_store/knowledge_store",
200
+ help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
201
+ )
202
+ parser.add_argument(
203
+ "--target_data",
204
+ type=str,
205
+ default="data_store/hyde_fc.json",
206
+ help="The path of the file that stores the claim.",
207
+ )
208
+ parser.add_argument(
209
+ "-o",
210
+ "--json_output",
211
+ type=str,
212
+ default="data_store/dev_retrieval_top_k.json",
213
+ help="The output dir for JSON files to save the top 100 sentences for each claim.",
214
+ )
215
+ parser.add_argument(
216
+ "--top_k",
217
+ default=5000,
218
+ type=int,
219
+ help="How many documents should we pick out with BM25.",
220
+ )
221
+ parser.add_argument(
222
+ "-s",
223
+ "--start",
224
+ type=int,
225
+ default=0,
226
+ help="Starting index of the files to process.",
227
+ )
228
+ parser.add_argument(
229
+ "-e",
230
+ "--end",
231
+ type=int,
232
+ default=-1,
233
+ help="End index of the files to process.",
234
+ )
235
+ parser.add_argument(
236
+ "-w",
237
+ "--workers",
238
+ type=int,
239
+ default=0,
240
+ help="Number of worker processes (default: number of CPU cores)",
241
+ )
242
+
243
+ args = parser.parse_args()
244
+ main(args)
system/baseline/train.json ADDED
The diff for this file is too large to render.
 
system/ee.py ADDED
@@ -0,0 +1,98 @@
1
+ import json
2
+ import os
3
+ import argparse
4
+ from tqdm import tqdm
5
+ import tiktoken
6
+ from openai import OpenAI
7
+
8
+ def gpt_4o(input_text):
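+ # Single GPT-4o call with JSON-mode output and deterministic decoding settings.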
9
+ client=OpenAI(api_key=os.environ.get("OAI"))
10
+ response = client.chat.completions.create(
11
+ model="gpt-4o",
12
+ messages=[
13
+ {"role": "user", "content": [{"type": "text", "text": input_text}]}
14
+ ],
15
+ response_format={"type": "json_object"},
16
+ temperature=0,
17
+ max_tokens=4096,
18
+ top_p=0,
19
+ frequency_penalty=0,
20
+ presence_penalty=0
21
+ )
22
+ return response.choices[0].message.content
23
+
24
+ def run_gpt4_event_extraction(data_dir, icl_path, max_tokens=100000):
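+ # For each claim, prompt GPT-4o with the in-context examples plus each scraped article and save the extracted events (with dates) as JSON per claim.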
25
+
26
+ all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
27
+ output_dir = os.path.join(data_dir, "gpt4_event_extraction")
28
+ os.makedirs(output_dir, exist_ok=True)
29
+
30
+ ICL = open(icl_path, "r").read()
31
+ all_info = open(all_info_path, "r").readlines()
32
+
33
+ enc = tiktoken.encoding_for_model("gpt-4o")
34
+
35
+ for i, line in enumerate(all_info):
36
+ ID = i
37
+ urls = []
38
+ results = []
39
+
40
+ data = json.loads(line)
41
+ docs = data["evidence"]
42
+ claim = data["claim"]
43
+
44
+ output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
45
+ if os.path.exists(output_path):
46
+ print(f"Output already exists: {output_path}")
47
+
48
+ else:
49
+
50
+ for doc in tqdm(docs):
51
+ if doc["url"] in urls:
52
+ continue
53
+
54
+ text = " ".join(doc["text"])
55
+ input_text = (
56
+ f"{ICL}\nInput:\n\nTitle: {doc['metadata']['title']}\n"
57
+ f"Date: {doc['metadata']['date']}\nArticle: {text}\n\n"
58
+ f"Please only summarize events that are useful for verifying the claim '{claim}', and their dates in the JSON format.\n\nOutput:\n"
59
+ )
60
+
61
+ urls.append(doc["url"])
62
+ text_tokens = enc.encode(input_text)
63
+ if len(text_tokens) > max_tokens:
64
+ input_text = enc.decode(text_tokens[:max_tokens])
65
+
66
+ try:
67
+ output = gpt_4o(input_text)
68
+ print(f"GPT-4o Response: {output}")
69
+ results.append({
70
+ "url": doc["url"],
71
+ "title": doc["metadata"]["title"],
72
+ "date": doc["metadata"]["date"],
73
+ "article": text,
74
+ "output": json.loads(output)
75
+ })
76
+ except Exception as e:
77
+ print(f"Error processing doc: {e}")
78
+ continue
79
+
80
+
81
+ with open(output_path, "w", encoding="utf-8") as f:
82
+ json.dump(results, f, ensure_ascii=False, indent=4)
83
+
84
+ return output_path
85
+
86
+ if __name__ == "__main__":
87
+ parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
88
+ parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
89
+ parser.add_argument("--icl_path", type=str, required=True, help="Path to ICL prompt file")
90
+ parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")
91
+
92
+ args = parser.parse_args()
93
+
94
+ run_gpt4_event_extraction(
95
+ data_dir=args.data_dir,
96
+ icl_path=args.icl_path,
97
+ max_tokens=args.max_tokens
98
+ )
system/existing_pledges.txt ADDED
@@ -0,0 +1,54 @@
1
+ Take back our streets by halving serious violent crime
2
+ We will support families with children by introducing free breakfast clubs in every primary school
3
+ We will finally deliver a full trans-inclusive ban on conversion practices
4
+ We will introduce a ‘Hillsborough Law’ which will place a legal duty of candour on public servants and authorities, and provide legal aid for victims of disasters or state-related deaths
5
+ As a first step, in England, we will deliver an extra two million NHS operations, scans, and appointments every year; that is 40,000 more appointments every week
6
+ We will end the use of offshore trusts to avoid inheritance tax so that everyone who makes their home here in the UK pays their taxes here
7
+ We will abolish non-dom status once and for all, replacing it with a modern scheme for people genuinely in the country for a short period
8
+ We will end the VAT exemption and business rates relief for private schools
9
+ We will get Britain building again … with 1.5 million new homes over the next parliament
10
+ We will ensure the next generation can never legally buy cigarettes
11
+ We will not increase taxes on working people
12
+ We will not increase taxes on working people, which is why we will not increase National Insurance
13
+ We will not increase taxes on working people, which is why we will not increase [...] the basic, higher, or additional rates of Income Tax
14
+ We will not increase taxes on working people, which is why we will not increase [...] VAT
15
+ We will intervene earlier to stop young people being drawn into crime, creating a new Young Futures programme with a network of hubs reaching every community
16
+ Raising confidence in the … criminal justice system to its highest levels
17
+ We will recruit an additional 8,500 new staff to treat children and adults through our first term
18
+ Kickstart economic growth to secure the highest sustained growth in the G7
19
+ Raising confidence in the police … to its highest levels
20
+ We will [introduce] new Respect Orders - powers to ban persistent adult offenders from town centres, which will stamp out issues such as public drinking and drug use
21
+ We will create a new Border Security Command, with hundreds of new investigators, intelligence officers, and cross-border police officers
22
+ Capitalised with £7.3 billion over the course of the next Parliament, the National Wealth Fund will have a remit to support We’s growth and clean energy missions
23
+ We will introduce a new participation requirement [for House of Lords members]
24
+ The next We government will therefore bring about an immediate modernisation, by introducing legislation to remove the right of hereditary peers to sit and vote in the House of Lords
25
+ A new Energy Independence Act will establish the framework for We’s energy and climate policies
26
+ We will introduce a Football Governance Bill, which will establish an independent regulator to ensure financial sustainability of football clubs in England
27
+ Every fiscal event making significant changes to taxation or spending will be subject to an independent OBR forecast
28
+ We will establish a National Wealth Fund
29
+ We will conduct a Strategic Defence Review within our first year in government
30
+ Ending the wasteful Migration and Economic Development partnership with Rwanda
31
+ We will cap corporation tax at the current level of 25%, the lowest in the G7, for the entire parliament
32
+ We will introduce a new ‘Fit For the Future’ fund to double the number of CT and MRI scanners, allowing the NHS to catch cancer and other conditions earlier, saving lives
33
+ We will … [give] 16- and 17-year-olds the right to vote in all elections
34
+ We will set up a new returns and enforcement unit, with an additional 1,000 staff, to fast-track removals to safe countries for people who do not have the right to stay here
35
+ We will capitalise Great British Energy with £8.3 billion, over the next parliament
36
+ We will immediately update the National Policy Planning Framework [sic] to undo damaging Conservative changes, including restoring mandatory housing targets
37
+ Recruit 6,500 new expert teachers in key subjects
38
+ We will carry out a review of sentencing to ensure it is brought up to date
39
+ We will train thousands more GPs
40
+ We will return to meeting NHS performance standards. That means patients should expect to wait no longer than 18 weeks from referral for consultant-led treatment of non-urgent health conditions
41
+ Productivity growth in every part of the country
42
+ The government will deliver a milestone of higher living standards in every part of the United Kingdom by the end of the Parliament
43
+ Giving children the best start in life, with a record 75% of 5-year-olds in England ready to learn when they start school
44
+ We will fix an additional one million potholes across England in each year of the next parliament
45
+ We will not grant new coal licences
46
+ We will set out the path to spending 2.5 per cent of GDP on defence
47
+ We will [...] address the inconsistencies in voter ID rules that prevent legitimate voters from voting. For example, in the case of HM Armed Forces Veteran Cards
48
+ We will also introduce a mandatory retirement age. At the end of the Parliament in which a member reaches 80 years of age, they will be required to retire from the House of Lords
49
+ We will create a new publicly-owned company, Great British Energy
50
+ We will negotiate additional returns arrangements to speed up returns
51
+ We will not issue new licences to explore new [oil and gas] fields
52
+ We will deliver our commitment to spend 2.5% of GDP on defence, but we will bring it forward so that we reach that level in 2027 and we will maintain that for the rest of this Parliament
53
+ We will tackle the immediate crisis with a rescue plan to provide 700,000 more urgent dental appointments
54
+ We will … end asylum hotels, saving the taxpayer billions of pounds
system/generate_output.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import argparse
4
+ from system.html2lines import html2metadata
5
+ from lxml.etree import tostring
6
+ import lxml.etree
7
+
8
+ def process_manifesto_data_with_metadata(input_base_dir: str):
9
+
10
+ input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
11
+ output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
12
+
13
+ url2text_dir = os.path.join(input_base_dir, "augmented_data_store")
14
+
15
+ with open(input_file_path, "r", encoding="utf-8") as f:
16
+ input_file = f.readlines()
17
+
18
+ out_file = open(output_file_path, "w", encoding="utf-8")
19
+
20
+
21
+ i = 0
22
+
23
+ for id, line in enumerate(input_file):
24
+ line = json.loads(line)
25
+ claim = line["claim"]
26
+ QAs = line["top_50"]
27
+ new_line = {"claim": claim, "evidence": []}
28
+
29
+ json_path = os.path.join(url2text_dir, f"{id}.jsonl")
30
+ if not os.path.exists(json_path):
31
+ print(f"Warning: {json_path} not found")
32
+ continue
33
+
34
+ with open(json_path, "r", encoding="utf-8") as f:
35
+ try:
36
+ data_store = json.load(f)
37
+ except json.JSONDecodeError:
38
+ f.seek(0)
39
+ data_store = [json.loads(line) for line in f]
40
+
41
+ url_txt = {data["url"]: data["url2text"] for data in data_store}
42
+
43
+ URLs = []
44
+ for j, QA in enumerate(QAs):
45
+ newQA = QA.copy()
46
+ URL = QA["url"]
47
+ newQA["text"] = url_txt.get(URL, "")
48
+
49
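+ # Fetch the page title and publication date via trafilatura metadata; fall back to empty fields on failure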
+ if URL not in URLs:
50
+ try:
51
+ meta = html2metadata(URL)
52
+ if isinstance(meta, lxml.etree._Element):
53
+ meta = tostring(meta, encoding="unicode", pretty_print=True)
54
+ meta_save = {
55
+ "title": meta["title"],
56
+ "date": meta["date"]
57
+ }
58
+ except Exception as e:
59
+ print(f"Metadata extraction failed for URL: {URL}, error: {e}")
60
+ meta_save = {
61
+ "title": "",
62
+ "date": ""
63
+ }
64
+
65
+
66
+ newQA["metadata"] = meta_save
67
+ new_line["evidence"].append(newQA)
68
+
69
+ out_file.write(json.dumps(new_line) + "\n")
70
+
71
+ out_file.close()
72
+ return output_file_path
73
+
74
+
75
+
system/hero_QA.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ import subprocess
4
+
5
+
6
+ def run_hero_reranking(user_id, end_date):
7
+ base_dir = f"outputs/{user_id}_{end_date}"
8
+ hero_dir = os.path.join(base_dir, "hero")
9
+ os.makedirs(hero_dir, exist_ok=True)
10
+
11
+ hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
12
+
13
+ def safe_run(cmd, timeout=600):
14
+ try:
15
+ print(f"👉 Running: {' '.join(cmd)}")
16
+ subprocess.run(cmd, check=True, timeout=timeout)
17
+ except subprocess.CalledProcessError as e:
18
+ print(f"[❌ ERROR] Subprocess failed: {e}")
19
+ if e.stderr:
20
+ print("[stderr]:", e.stderr.decode())
21
+ raise
22
+ except subprocess.TimeoutExpired:
23
+ print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
24
+ raise
25
+
26
+ # Step 3.2: retrieval
27
+ print("🔍 Step 3.2: Retrieval from knowledge store ...")
28
+ knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
29
+ retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")
30
+
31
+ if not os.path.exists(retrieval_output):
32
+ safe_run([
33
+ "python3.12", "baseline/retrieval_optimized.py",
34
+ "--knowledge_store_dir", knowledge_store_dir,
35
+ "--target_data", hyde_output,
36
+ "--json_output", retrieval_output
37
+ ])
38
+
39
+ # Step 3.3: reranking
40
+ print("🏷️ Step 3.3: Reranking retrieved evidence ...")
41
+ rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")
42
+
43
+ if not os.path.exists(rerank_output):
44
+ safe_run([
45
+ "python3.12", "baseline/reranking_optimized.py",
46
+ "--target_data", retrieval_output,
47
+ "--json_output", rerank_output
48
+ ])
49
+
50
+ return {
51
+ "hyde": hyde_output,
52
+ "retrieved": retrieval_output,
53
+ "reranked": rerank_output,
54
+ }
55
+
56
+
57
+ if __name__ == "__main__":
58
+ output_files = run_hero_reranking(user_id="xxx", end_date="20250604")
59
+ for key, path in output_files.items():
60
+ print(f"✅ {key}: {path}")
system/hero_pipeline.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ import subprocess
4
+ from huggingface_hub import hf_hub_download
5
+ import json
6
+
7
+ def run_hero_reranking(pipeline_base_dir, suggestion_meta):
8
+ base_dir = f"{pipeline_base_dir}"
9
+ hero_dir = os.path.join(base_dir, "hero")
10
+ os.makedirs(hero_dir, exist_ok=True)
11
+
12
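+ # For a suggested (known) pledge, reuse the precomputed HyDE output from the private dataset, sliced by its index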
+ if suggestion_meta:
13
+ hyde_path = hf_hub_download(
14
+ repo_id="PledgeTracker/demo_feedback",
15
+ filename="manifesto_icl_hyde_fc.json",
16
+ repo_type="dataset",
17
+ token=os.environ["HF_TOKEN"]
18
+ )
19
+ with open(hyde_path, "r", encoding="utf-8") as f:
20
+ all_hyde_data = json.load(f)
21
+
22
+ idx = suggestion_meta["index"]
23
+ single_hyde = [all_hyde_data[idx]]
24
+ save_path = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
25
+ with open(save_path, "w", encoding="utf-8") as f:
26
+ json.dump(single_hyde, f, indent=2)
27
+
28
+ hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
29
+
30
+ def safe_run(cmd, timeout=600):
31
+ try:
32
+ print(f"👉 Running: {' '.join(str(x) for x in cmd)}")
33
+ subprocess.run(cmd, check=True, timeout=timeout)
34
+ except subprocess.CalledProcessError as e:
35
+ print(f"[❌ ERROR] Subprocess failed: {e}")
36
+ if e.stderr:
37
+ print("[stderr]:", e.stderr.decode())
38
+ raise
39
+ except subprocess.TimeoutExpired:
40
+ print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
41
+ raise
42
+
43
+ # Step 3.2: retrieval
44
+ print("🔍 Step 3.2: Retrieval from knowledge store ...")
45
+ knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
46
+ retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")
47
+
48
+ if not os.path.exists(retrieval_output):
49
+ safe_run([
50
+ "python", "system/baseline/retrieval_optimized.py",
51
+ "--knowledge_store_dir", knowledge_store_dir,
52
+ "--target_data", hyde_output,
53
+ "--json_output", retrieval_output,
54
+ ])
55
+
56
+ # Step 3.3: reranking
57
+ print("🏷️ Step 3.3: Reranking retrieved evidence ...")
58
+ rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")
59
+
60
+ if not os.path.exists(rerank_output):
61
+ safe_run([
62
+ "python", "system/baseline/reranking_optimized.py",
63
+ "--target_data", retrieval_output,
64
+ "--json_output", rerank_output,
65
+ "--top_k", str(50),
66
+ ])
67
+
68
+ return {
69
+ "hyde": hyde_output,
70
+ "retrieved": retrieval_output,
71
+ "reranked": rerank_output,
72
+ }
73
+
74
+
75
+ def run_hero_pipeline(pipeline_base_dir):
76
+ base_dir = f"{pipeline_base_dir}"
77
+ hero_dir = os.path.join(base_dir, "hero")
78
+ os.makedirs(hero_dir, exist_ok=True)
79
+
80
+ target_data = os.path.join(base_dir, "claim.json")
81
+ hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
82
+
83
+ def safe_run(cmd, timeout=600):
84
+ try:
85
+ print(f"👉 Running: {' '.join(cmd)}")
86
+ subprocess.run(cmd, check=True, timeout=timeout)
87
+ except subprocess.CalledProcessError as e:
88
+ print(f"[❌ ERROR] Subprocess failed: {e}")
89
+ if e.stderr:
90
+ print("[stderr]:", e.stderr.decode())
91
+ raise
92
+ except subprocess.TimeoutExpired:
93
+ print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
94
+ raise
95
+
96
+ # Step 3.1: hyde_fc_generation
97
+ if not os.path.exists(hyde_output):
98
+ print("🧠 Step 3.1: HyDE ICL generation ...")
99
+ safe_run([
100
+ "python", "system/baseline/hyde_fc_generation_optimized.py",
101
+ "--target_data", target_data,
102
+ "--json_output", hyde_output
103
+ ])
104
+
105
+ # Step 3.2: retrieval
106
+ print("🔍 Step 3.2: Retrieval from knowledge store ...")
107
+ knowledge_store_dir = os.path.join(base_dir, "initial_data_store")
108
+ retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k.json")
109
+
110
+ if not os.path.exists(retrieval_output):
111
+ safe_run([
112
+ "python", "system/baseline/retrieval_optimized.py",
113
+ "--knowledge_store_dir", knowledge_store_dir,
114
+ "--target_data", hyde_output,
115
+ "--json_output", retrieval_output
116
+ ])
117
+
118
+ # Step 3.3: reranking
119
+ print("🏷️ Step 3.3: Reranking retrieved evidence ...")
120
+ rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k.json")
121
+
122
+ if not os.path.exists(rerank_output):
123
+ safe_run([
124
+ "python", "system/baseline/reranking_optimized.py",
125
+ "--target_data", retrieval_output,
126
+ "--json_output", rerank_output
127
+ ])
128
+
129
+ # Step 3.4: question generation
130
+ print("❓ Step 3.4: Generating QA pairs ...")
131
+ reference_corpus = "system/baseline/train.json"
132
+ qa_output = os.path.join(hero_dir, "manifesto_icl_top_k_qa.json")
133
+
134
+ if not os.path.exists(qa_output):
135
+ safe_run([
136
+ "python", "system/baseline/question_generation_optimized.py",
137
+ "--reference_corpus", reference_corpus,
138
+ "--top_k_target_knowledge", rerank_output,
139
+ "--output_questions", qa_output,
140
+ "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct"
141
+ ])
142
+
143
+ return {
144
+ "hyde": hyde_output,
145
+ "retrieved": retrieval_output,
146
+ "reranked": rerank_output,
147
+ "qa_pairs": qa_output
148
+ }
149
+
150
+
151
+ if __name__ == "__main__":
152
+ user_id="xxx"
153
+ end_date="20250604"
154
+ pipeline_base_dir = f"{user_id}_{end_date}"
155
+ output_files = run_hero_pipeline(pipeline_base_dir)
156
+ for key, path in output_files.items():
157
+ print(f"✅ {key}: {path}")
system/html2lines.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from time import sleep
3
+ import trafilatura
4
+ from trafilatura.meta import reset_caches
5
+ from trafilatura.settings import DEFAULT_CONFIG
6
+ import spacy
7
+ from lxml.etree import tostring
8
+ import lxml.etree
9
+
10
+
11
+
12
+ import subprocess
13
+
14
+ try:
15
+ nlp = spacy.load("en_core_web_lg")
16
+ except OSError:
17
+ print("🔁 Downloading spaCy model 'en_core_web_lg' ...")
18
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
19
+ nlp = spacy.load("en_core_web_lg")
20
+
21
+
22
+ DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
23
+ MIN_CHAR = 50
24
+ MAX_CHAR = 5000
25
+
26
+
27
+ def get_page(url):
28
+ page = None
29
+ for _ in range(3):
30
+ try:
31
+ page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
32
+ assert page is not None
33
+ print("Fetched " + url, file=sys.stderr)
34
+ break
35
+ except:
36
+ sleep(3)
37
+ return page
38
+
39
+
40
+ def url2lines(url):
41
+ page = get_page(url)
42
+
43
+ if page is None:
44
+ return []
45
+
46
+ lines = html2lines(page)
47
+ return lines
48
+
49
+
50
+ def line_correction(lines, max_size=100):
51
+ out_lines = []
52
+ for line in lines:
53
+ if len(line) < MIN_CHAR:
54
+ continue
55
+
56
+ if len(line) > max_size:
57
+ doc = nlp(
58
+ line[:MAX_CHAR]
59
+ ) # We split lines into sentences, but for performance we take only the first 5k characters per line
60
+ stack = ""
61
+ for sent in doc.sents:
62
+ if len(stack) > 0:
63
+ stack += " "
64
+ stack += str(sent).strip()
65
+ if len(stack) > max_size:
66
+ out_lines.append(stack)
67
+ stack = ""
68
+
69
+ if (
70
+ len(stack) > MIN_CHAR
71
+ ): # Ensure every lines in the out_lines suffice the MIN_CHAR restriction
72
+ out_lines.append(stack)
73
+ else:
74
+ out_lines.append(line)
75
+
76
+ return out_lines
77
+
78
+
79
+ def html2lines(page):
80
+ out_lines = []
81
+
82
+ if page is None or len(page.strip()) == 0:
83
+ return out_lines
84
+
85
+ text = trafilatura.extract(page, config=DEFAULT_CONFIG)
86
+ reset_caches()
87
+
88
+ if text is None:
89
+ return out_lines
90
+
91
+ return text.split(
92
+ "\n"
93
+ ) # We just spit out the entire page, so need to reformat later.
94
+
95
+
96
+ def html2metadata(url):
97
+ page = get_page(url)
98
+ metadata = trafilatura.extract_metadata(page)
99
+ return metadata.as_dict()
100
+
101
+ if __name__ == "__main__":
102
+ url = "https://www.bbc.co.uk/news/61407508"
103
+ metadata = html2metadata(url)
104
+ text = " ".join(html2lines(page))
105
+ print(metadata)
system/icl.txt ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Input:
2
+
3
+ Title: New investment for Border Security Command
4
+ Date: 2024-09-17
5
+ Article: Up to £75 million in new investment for the Border Security Command paves way for an autumn immigration crime crackdown. The UK’s Border Security Command will deliver cutting edge new technology, extra officers and further covert capabilities across the system following a significant, immediate cash injection, Home Secretary Yvette Cooper announced today. As part of the new Border Security Command uplift, the National Crime Agency (NCA), the police and other law enforcement agency partners will receive a significant cash injection to bolster the UK’s border security and disrupt the criminal people smuggling gangs. The investment comes ahead of an expected effort by the smuggling gangs to cram ever more vulnerable people into unseaworthy boats launched from the French coast while the weather remains fair. Their industrial scale smuggling business is under sustained pressure from co-ordinated UK and European partner law enforcement action. The Home Secretary announced the package of up to £75 million, which redirects funds originally allocated to the previous government’s Illegal Migration Act. It will unlock sophisticated new technology and extra capabilities for the NCA to bolster UK border security and disrupt the criminal people smuggling gangs. The investment is designed to build on a pattern of successful upstream disruptions announced at an operational summit, attended by the Prime Minister, at the NCA headquarters last week. - covert cameras and state of the art monitoring technology, enhancing evidence collection, speeding up investigations and increasing the likelihood of successful prosecutions - establishing a new unit to improve intelligence collection across UK police forces and information flows to partners, alongside an uplift in prosecutors working in the Crown Prosecution Service to act on investigations to swiftly bring those responsible to justice - recruitment of additional personnel for the new Border Security Command, led by Commander Martin Hewitt, which will oversee the co-operation of all of the organisations involved in smashing the gangs - increased work to tackle organised crime groups facilitating irregular migration upstream by intensifying efforts in transit countries to prevent small boat equipment reaching the French coast The announcement follows yesterday’s meeting between the Prime Minister and his Italian counterpart, Giorgia Meloni, in Rome to discuss systematic bilateral co-operation on border security. Italy has seen a significant drop in irregular migration thanks to tougher enforcement and enhanced cooperation with international partners. Newly appointed Border Security Commander – a director general senior civil servant appointment – Martin Hewitt joined the UK delegation to Rome. The enhanced technical and staffing resources announced today will be an important platform for the work he will co-ordinate across UK law enforcement and intelligence agencies when he formally starts his role in the coming weeks. The funding also covers an additional 100 specialist investigators for the NCA, which was announced by the government last month, representing a 25% increase in the agency’s dedicated personnel tackling organised immigration crime. The government has also announced a 50% increase in the number of British officers stationed at Europol, supporting European operations to dismantle organised crime groups facilitating people smuggling. Criminal gangs are getting away with undermining our border security and putting lives at risk. 
The Border Security Command will deliver a major overhaul and upgrade in law enforcement against smugglers and trafficking gangs to boost our border security. State of the art technology and enhanced intelligence capabilities will ensure we are using every tool at our disposal to dismantle this vile trade. I welcome this funding, which will allow us to improve and extend our technology, data exploitation, and capacity-building both internationally and in the UK. Tackling organised immigration crime remains a top priority for the NCA, we are currently leading around 70 investigations into the gangs or individuals involved in the highest echelons of criminality, and we are devoting more resources to it than ever before. We are determined to do all we can to disrupt and dismantle these networks, wherever they operate. CPS Director of Public Prosecutions Stephen Parkinson said: CPS prosecutors will bring significant expertise to the new unit to help stop human trafficking gangs in their tracks, and pursue any assets gained through criminality. Working with partners, we will continue to discourage, disrupt and dismantle this exploitative trade through prosecutions and cross-border collaboration. The announcement coincides with a concerted push by UK ministers to tackle shared border security challenges. Immigration Minister Dame Angela Eagle is attending the annual Berlin Process Interior Ministers’ meeting in Germany today (Tuesday 17 September), to discuss strengthening border security, tackling organised crime groups and combatting violence against women and girls across the Western Balkans region. The meeting brings together European partners with a focus on working with partners across the Western Balkans, a key region in the journey of irregular migrants through Europe and, in many cases, onwards to the UK.
6
+
7
+ Summarize events and their dates in the JSON format.
8
+
9
+ Output:
10
+
11
+ {
12
+ "events":[
13
+ {
14
+ "event": "Announcement of up to £75 million in new investment for the UK's Border Security Command by Home Secretary Yvette Cooper.",
15
+ "date": "2024-09-17"
16
+ },
17
+ {
18
+ "event": "Immigration Minister Dame Angela Eagle attending the annual Berlin Process Interior Ministers’ meeting in Germany to discuss strengthening border security.",
19
+ "date": "2024-09-17"
20
+ },
21
+ {
22
+ "event": "An operational summit attended by the Prime Minister at the NCA headquarters where successful upstream disruptions were announced.",
23
+ "date": "Last week (relative to 2024-09-17)"
24
+ },
25
+ {
26
+ "event": "A meeting took place in Rome between the UK Prime Minister and Italian Prime Minister Giorgia Meloni to discuss systematic bilateral cooperation on border security.",
27
+ "date": "Yesterday (relative to 2024-09-17)"
28
+ },
29
+ {
30
+ "event": "The government announced an additional 100 specialist investigators for the NCA, representing a 25% increase in dedicated personnel tackling organised immigration crime.",
31
+ "date": "Last month (relative to 2024-09-17)"
32
+ }
33
+ ]
34
+ }
35
+
36
+ Input:
37
+
38
+ Title: Home Secretary announces new measures to boost Britain’s border security
39
+ Date: 2024-08-21
40
+ Article: Home Secretary announces new measures to boost Britain’s border security Home Secretary announces new measures to strengthen border security, enforce immigration rules and increase returns. New measures to boost Britain’s border security are being set out today (21 August) by the Home Secretary, including the immediate recruitment of up to 100 new specialist intelligence and investigation officers at the National Crime Agency (NCA) to target, dismantle and disrupt organised immigration crime networks. Yvette Cooper has also today announced a major surge in immigration enforcement and returns activity, to make sure that immigration and asylum rules are respected and enforced - saying that the government has new plans for the next 6 months to achieve the highest rate of removals of those with no right to be here, including failed asylum seekers, for 5 years (since 2018). In addition, a new intelligence-driven illegal working programme will be rolled out to target, investigate and take down unscrupulous employers who illegally employ those with no right to work here. The new measures are fulfilling on the government’s commitment to provide long-term security to our borders. - up to 100 new specialist intelligence and investigations officers deployed to the National Crime Agency (NCA) to disrupt and smash criminal smuggling gangs and prevent dangerous boat crossings - a large surge in enforcement and returns flights, with the aim of putting removals at their highest level since 2018, reversing the damaging drop in enforcement over recent years - increased detention capacity including 290 added beds at Campsfield and Haslar Immigration Removal Centres - redeployment of staff to drive this increase in returns - sanctions to be taken against unscrupulous employers who hire workers illegally This comes on top of the 50% uplift in the number of NCA officers stationed in Europol. These officers have been immediately deployed to support European operations to disrupt the activity of criminal smuggling gangs making millions out of small boat crossings. The NCA currently has around 70 investigations targeting the highest harm criminal networks involved in people smuggling and trafficking, and worked with international partners to support the seizure of around 400 boats and engines intended for use in channel crossings. A range of sanctions, including financial penalty notices, business closure orders and potential prosecution, will be taken against those employing illegal workers. Those caught working illegally and eligible for removal will be detained, pending their swift removal. Alongside this, the government is increasing detention spaces to support the higher pace of removals including reopening and adding 290 beds across Immigration Removal Centres (IRCs) at Campsfield and Haslar. This increase will ensure there is additional capacity to facilitate higher levels of enforcement and returns so that rules are properly respected. Building on 9 successful returns flights in the last six weeks, including the largest-ever chartered return flight, the government is redeploying personnel and resources to support further activity. Staff are being redeployed to increase removal of failed asylum seekers, which had dropped by 40% since 2010. Three hundred caseworkers have already been reassigned to progress thousands of failed asylum and returns cases, including enforced and voluntary returns. 
Enhanced digital capabilities will be deployed to ensure consistent contact throughout, preventing those with no right to be here from disappearing into exploitative illegal working and ensure they can be returned. This enforcement surge, overseen by Bas Javid, the Home Office’s Director General for Immigration Enforcement, is part of the government’s plans to transform the asylum system and secure UK borders. This will ensure that all Immigration Enforcement processes are implemented firmly, fairly, and accurately throughout, whilst also taking account of the important lessons learnt from Windrush. We are taking strong and clear steps boost our border security and ensure the rules are respected and enforced. Our new Border Security Command is already gearing up, with new staff being urgently recruited and additional staff already stationed across Europe, working with European enforcement agencies to find every route in to smashing the criminal smuggling gangs organising dangerous boat crossings which undermine our border security and putting lives at risk. And by increasing enforcement capabilities and returns we will establish a system that is better controlled and managed, in place of the chaos that has blighted the system for far too long. NCA Director General of Operations Rob Jones said: Tackling organised immigration crime remains a key priority for the NCA and we are dedicating more effort and resource than ever before. These extra officers will play a key role in that, with the NCA currently leading around 70 investigations into the highest harm people smuggling and trafficking groups. Taking on these dangerous and exploitative gangs requires international co-operation and we continue to further enhance our already strong relationship with Europol and other law enforcement partners. We are determined to do all we can to disrupt and dismantle these networks, whether they are operating in the UK or overseas. This work builds on the Prime Minister’s meeting at the European Political Community last month, where he held discussions with the migration working group alongside Italy, Albania, Germany, Malta, Denmark, Hungary, The Netherlands, and Slovakia. The European leaders discussed border security, their joint efforts to tackle people-smuggling, and the ambition to work collectively with other countries to deliver solutions. Since taking office, the Home Secretary has also held calls with a range of partners to discuss increasing cooperation to tackle organised immigration crime.
41
+
42
+ Summarize events and their dates in the JSON format.
43
+
44
+ Output:
45
+
46
+ {
47
+ "events": [
48
+ {
49
+ "event": "Home Secretary Yvette Cooper announces new measures to boost Britain's border security, including the recruitment of up to 100 new specialist intelligence and investigation officers at the National Crime Agency (NCA).",
50
+ "date": "2024-08-21"
51
+ },
52
+ {
53
+ "event": "Announcement of a major surge in immigration enforcement and returns activity to achieve the highest rate of removals of those with no right to be in the UK since 2018.",
54
+ "date": "2024-08-21"
55
+ },
56
+ {
57
+ "event": "Introduction of a new intelligence-driven illegal working programme to target and take down employers who illegally employ individuals with no right to work in the UK.",
58
+ "date": "2024-08-21"
59
+ },
60
+ {
61
+ "event": "The government announces increased detention capacity, including 290 added beds at Campsfield and Haslar Immigration Removal Centres.",
62
+ "date": "2024-08-21"
63
+ },
64
+ {
65
+ "event": "The Prime Minister's meeting at the European Political Community last month, where discussions were held with European leaders on border security and tackling people-smuggling.",
66
+ "date": "Last month (relative to 2024-08-21)"
67
+ }
68
+ ]
69
+ }
system/initial_searching.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+ import requests
5
+ import pandas as pd
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ import spacy
9
+ import subprocess
10
+
11
+ try:
12
+ nlp = spacy.load("en_core_web_sm")
13
+ except OSError:
14
+ print("🔁 Downloading en_core_web_sm model ...")
15
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
16
+ nlp = spacy.load("en_core_web_sm")
17
+
18
+ def clean_keywords(text):
19
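+ # Collect deduplicated keywords from noun chunks, dropping stop words and non-alphabetic tokens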
+ doc = nlp(text)
20
+ keywords = []
21
+ for chunk in doc.noun_chunks:
22
+ words = [token.text for token in chunk if not token.is_stop and token.is_alpha]
23
+ if words:
24
+ cleaned_phrase = " ".join(words)
25
+ if len(cleaned_phrase) > 2:
26
+ keywords.append(cleaned_phrase)
27
+ return list(set(keywords))
28
+
29
+ def google_search(query, api_key, search_engine_id, start_date, end_date):
30
+ print(f"[SYSTEM] Calling Google Search API for: {query}")
31
+ sort = f"date:r:{start_date}:{end_date}"
32
+ url = "https://www.googleapis.com/customsearch/v1"
33
+ params = {
34
+ "q": query,
35
+ "key": api_key,
36
+ "cx": search_engine_id,
37
+ "num": 10,
38
+ "sort": sort,
39
+ "cr": "countryUK",
40
+ "gl": "uk"
41
+ }
42
+ try:
43
+ response = requests.get(url, params=params)
44
+ response.raise_for_status()
45
+ return response.json().get("items", [])
46
+ except Exception as e:
47
+ print(f"[ERROR] Google Search Failed: {e}")
48
+ return []
49
+
50
+ def save_tsv(file_path, claim_id, claim_text, url_list):
51
+ df = pd.DataFrame({
52
+ 'ID': [claim_id] * len(url_list),
53
+ 'String': ["claim"] * len(url_list),
54
+ 'ListValue': url_list,
55
+ 'query': [claim_text] * len(url_list)
56
+ })
57
+ df.to_csv(file_path, sep='\t', index=False, header=False)
58
+
59
+ def ensure_directory_exists(path):
60
+ dir_path = Path(path).expanduser().resolve().parent
61
+ if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
62
+ raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
63
+ dir_path.mkdir(parents=True, exist_ok=True)
64
+
65
+ def run_initial_searching(claim_text, pipeline_base_dir, start_date, end_date, user_id, claim_id):
66
+ api_key = os.environ.get("GOOGLE_API_KEY")
67
+ search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
68
+ if not api_key or not search_engine_id:
69
+ raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")
70
+
71
+ base_dir = pipeline_base_dir
72
+ manifesto_json_file = os.path.join(base_dir,"claim.json")
73
+ tsv_file_path = os.path.join(base_dir,"initial_search_results.tsv")
74
+
75
+ ensure_directory_exists(tsv_file_path)
76
+
77
+ claim_record = {"claim_id": claim_id, "claim": claim_text}
78
+ # if manifesto_json_file.exists():
79
+ # with open(manifesto_json_file, "r") as f:
80
+ # records = json.load(f)
81
+ # else:
82
+ records = []
83
+ records.append(claim_record)
84
+ with open(manifesto_json_file, "w") as f:
85
+ json.dump(records, f, indent=1)
86
+
87
+ urls = []
88
+ results = google_search(f"{claim_text}", api_key, search_engine_id, start_date, end_date)
89
+ urls += [r["link"] for r in results if "link" in r]
90
+ keywords = clean_keywords(claim_text)
91
+ keyword_text = " ".join(keywords)
92
+ # for kw in keywords:
93
+ # results = google_search(kw, api_key, search_engine_id, start_date, end_date)
94
+ # urls += [r["link"] for r in results if "link" in r]
95
+ results = google_search(keyword_text, api_key, search_engine_id, start_date, end_date)
96
+ urls += [r["link"] for r in results if "link" in r]
97
+ urls = list(dict.fromkeys(urls))
98
+
99
+ save_tsv(str(tsv_file_path), claim_id, claim_text, urls)
100
+ print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
101
+ return str(tsv_file_path), str(manifesto_json_file)
system/instruction.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ You are given a pledge, the pledge speaker, and the date of when the pledge is made, and a key event summarized from an online article along with the date of when the event happens. Your task is to determine whether this event summary is useful to track the fulfilment of this pledge.
2
+
3
+ Yes:
4
+ The summary presents developments or actions that demonstrate progress (or lack thereof) towards fulfilling the pledge. It helps evaluate whether the pledge is on track or not.
5
+
6
+ No:
7
+ The summary only provides background or contextual information, but no progress information for evaluating the fulfilment of the pledge. Or the summary is only loosely related, or not related, to the pledge.
system/pledge_tracking.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import login
2
+ from datetime import datetime
3
+ import os, time
4
+ import pandas as pd
5
+
6
+ from system.initial_searching import run_initial_searching
7
+ from system.scraper import run_scraper
8
+ from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
9
+ from system.augmented_searching import run_augmented_searching
10
+ from system.generate_output import process_manifesto_data_with_metadata
11
+ from system.ee import run_gpt4_event_extraction
12
+ from system.process_time import extract_and_sort_events
13
+ import spacy
14
+ import subprocess
15
+ from huggingface_hub import hf_hub_download
16
+ import json
17
+
18
+ try:
19
+ spacy.load("en_core_web_sm")
20
+ except OSError:
21
+ print("🔁 Downloading en_core_web_sm model ...")
22
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
23
+ nlp = spacy.load("en_core_web_sm")
24
+
25
+
26
+ def count_total_events(output_path):
27
+ with open(output_path, "r", encoding="utf-8") as f:
28
+ results = json.load(f)
29
+
30
+ total_events = 0
31
+ for item in results:
32
+ try:
33
+ events = item["output"]
34
+ if isinstance(events, list):
35
+ total_events += len(events)
36
+ else:
37
+ print(f"invalid: {events}")
38
+ except KeyError:
39
+ print(f"lack item: {item}")
40
+
41
+ print(f"{total_events} events in total")
42
+ return total_events
43
+
44
+ def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
45
+ pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
46
+ os.makedirs(pipeline_base_dir, exist_ok=True)
47
+
48
+ step_id=1
49
+
50
+ # Step 1: Google search
51
+ if suggestion_meta is None:
52
+
53
+
54
+ print("🔍 Step 1: Initial searching ...")
55
+ initial_tsv_file, claim_json_path = run_initial_searching(
56
+ claim_text=f"{pledge_author} : {claim}",
57
+ # pledge_author=pledge_author,
58
+ pipeline_base_dir=pipeline_base_dir,
59
+ start_date=start_date,
60
+ end_date="",
61
+ user_id=user_id,
62
+ claim_id=0,
63
+ )
64
+ with open(initial_tsv_file, "r", encoding="utf-8") as f:
65
+ line_count = sum(1 for line in f)
66
+ if update_fn:
67
+ update_fn(step_id, f"We have found {line_count} URLs")
68
+ step_id+=1
69
+
70
+
71
+ print("🌐 Step 2: Scraping URLs ...")
72
+ initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
73
+ os.makedirs(initial_data_store_dir, exist_ok=True)
74
+ initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
75
+ run_scraper(initial_tsv_file, initial_scraped_output_path)
76
+
77
+ with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
78
+ line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
79
+ if update_fn:
80
+ update_fn(step_id, f"We have scraped {line_count} URLs")
81
+ step_id+=1
82
+
83
+
84
+ print("🧠 Step 3: HerO processing ...")
85
+ hero_output_dir = os.path.join(pipeline_base_dir, "hero")
86
+ os.makedirs(hero_output_dir, exist_ok=True)
87
+ run_hero_pipeline(pipeline_base_dir)
88
+
89
+ qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")
90
+
91
+ with open(qa_file_path, "r", encoding="utf-8") as f:
92
+ questions = {line["question"] for line in json.load(f)["evidence"]}
93
+ line_count = len(questions)
94
+ if update_fn:
95
+ update_fn(step_id, f"We have generated {line_count} search queries")
96
+ step_id+=1
97
+
98
+ else:
99
+ claim_json_path = None
100
+ initial_scraped_output_path = None
101
+ initial_tsv_file = None
102
+ hero_output_dir = None
103
+ qa_file_path = hf_hub_download(
104
+ repo_id="PledgeTracker/demo_feedback",
105
+ filename="manifesto_with_QA_icl_top_k_qa.json",
106
+ repo_type="dataset",
107
+ token=os.environ["HF_TOKEN"]
108
+ )
109
+ print(qa_file_path)
110
+
111
+
112
+ augmented_tsv_file = run_augmented_searching(
113
+ qa_file=qa_file_path,
114
+ pledge_author=pledge_author,
115
+ pipeline_base_dir=pipeline_base_dir,
116
+ start_date=start_date,
117
+ suggestion_meta=suggestion_meta,
118
+ end_date="",
119
+ user_id=user_id,
120
+ claim_id=0,
121
+ )
122
+ with open(augmented_tsv_file, "r", encoding="utf-8") as f:
123
+ line_count = sum(1 for line in f)
124
+ if update_fn:
125
+ update_fn(step_id, f"We have found {line_count} URLs")
126
+ step_id+=1
127
+
128
+ augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
129
+ os.makedirs(augmented_data_store_dir, exist_ok=True)
130
+ augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
131
+ run_scraper(augmented_tsv_file, augmented_scraped_output_path)
132
+
133
+ with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
134
+ line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
135
+ if update_fn:
136
+ update_fn(step_id, f"We have scraped {line_count} URLs")
137
+ step_id+=1
138
+
139
+
140
+ run_hero_reranking(pipeline_base_dir, suggestion_meta)
141
+
142
+ # Step 7: Preparing for GPT-4
143
+ # print("🧠 Step 7: Processing format ...")
144
+
145
+ meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)
146
+
147
+ # Step 8: Event extraction using GPT-4
148
+ print("🧠 Extracting events ...")
149
+
150
+ all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
151
+ unique_urls = set()
152
+ with open(all_info_path, "r", encoding="utf-8") as f:
153
+ for line in f:
154
+ data = json.loads(line)
155
+ docs = data.get("evidence", [])
156
+ for doc in docs:
157
+ if "url" in doc:
158
+ unique_urls.add(doc["url"])
159
+ if update_fn:
160
+ update_fn(step_id, f"We have found {len(unique_urls)} most relevant documents")
161
+ step_id+=1
162
+
163
+ extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, icl_path="system/icl.txt", max_tokens=100000)
164
+
165
+ events_num = count_total_events(extracted_event_path)
166
+
167
+ if update_fn:
168
+ update_fn(step_id, f"We have extracted {events_num}")
169
+ step_id+=1
170
+
171
+
172
+ # Step 9: Sorting events and labelling usefulness
173
+ print("📅 Sorting events temporally ...")
174
+
175
+
176
+ sorted_events = extract_and_sort_events(
177
+ data_dir=pipeline_base_dir,
178
+ pledge_date=pledge_date,
179
+ pledge_author=pledge_author,
180
+ claim=claim,
181
+ suggestion_meta=suggestion_meta
182
+ )
183
+ print(sorted_events)
184
+ df = pd.DataFrame(sorted_events)
185
+ sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
186
+ df.to_excel(sorted_event_path, index=False)
187
+
188
+ if update_fn:
189
+ update_fn(step_id, "All events are sorted!")
190
+
191
+ return {
192
+ "claim_json": claim_json_path,
193
+ "initial_scraped_jsonl": initial_scraped_output_path,
194
+ "initial_tsv_file": initial_tsv_file,
195
+ "hero_dir": hero_output_dir,
196
+ "augmented_scraped_jsonl": augmented_scraped_output_path,
197
+ "augmented_tsv_file": augmented_tsv_file,
198
+ "meta_data_dir": meta_data_dir,
199
+ "unsorted_events": extracted_event_path,
200
+ "sorted_events": sorted_event_path,
201
+ }
202
+
203
+
204
+ if __name__ == "__main__":
205
+ start = time.time()
206
+
207
+ if os.environ.get("HF_TOKEN"):
208
+ login(token=os.environ["HF_TOKEN"])
209
+ else:
210
+ print("No Hugging Face token found in environment variable HF_TOKEN.")
211
+
212
+ claim = "“We will support families with children by introducing free breakfast clubs in every primary school”"
213
+ start_date = "20250504"
214
+ timestamp = "xxxxx"
215
+ user_id = "xxx"
216
+
217
+ outputs = run_pipeline(claim, time_start, timestamp, user_id)
218
+ print("🎯 Pipeline finished. Outputs:", outputs)
219
+ print(f"⏱️ Total time: {time.time() - start:.2f} seconds")
system/process_time.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import datetime
3
+ import re
4
+ import pandas as pd
5
+ import os, argparse
6
+ import random
7
+ import csv
8
+ from openai import OpenAI
9
+ from huggingface_hub import hf_hub_download
10
+ import json
11
+ import os
12
+
13
+
14
+
15
+ def gpt_4o_useful(input):
16
+ client=OpenAI(api_key=os.environ.get("OAI"))
17
+ response = client.chat.completions.create(
18
+ model="gpt-4o",
19
+ messages=[
20
+ {
21
+ "role": "user",
22
+ "content": [
23
+ {
24
+ "type": "text",
25
+ "text": input
26
+ }
27
+ ]
28
+ }
29
+ ],
30
+ response_format={"type": "text"},
31
+ temperature=0.0000000001,
32
+ max_tokens=4096,
33
+ top_p=0,
34
+ frequency_penalty=0,
35
+ presence_penalty=0,
36
+ logprobs=True
37
+ )
38
+
39
+ text = response.choices[0].message.content
40
+
41
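+ # Use the log-probability of the first generated token (typically the Yes/No label) as a confidence score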
+ if response.choices[0].logprobs and response.choices[0].logprobs.content:
42
+ first_token_logprob = response.choices[0].logprobs.content[0]
43
+ token = first_token_logprob.token
44
+ logprob = first_token_logprob.logprob
45
+ else:
46
+ token = None
47
+ logprob = None
48
+
49
+ return text, token, logprob
50
+
51
+
52
+
53
+ def get_ICL(data, top_k=None):
54
+
55
+ ICL =""
56
+ if top_k is None:
57
+ data = data
58
+ else:
59
+ # print(data)
60
+ data = data[:top_k]
61
+ for line in data:
62
+ # line = json.loads(line)
63
+ pledge = line["pledge"]
64
+ event = line["event_description"]
65
+ time = line["event_date"]
66
+ input=f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful?"
67
+ input = input.strip()
68
+ output = line["label"].strip()
69
+ ICL = f"{ICL}Input: {input}\nOutput: {output}\n\n"
70
+ return ICL
71
+
72
+ def load_json(file_path):
73
+ with open(file_path, 'r', encoding='utf-8') as f:
74
+ data = json.load(f)
75
+ return data
76
+
77
+
78
+ def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
79
+
80
+ if suggestion_meta:
81
+ # print(ICL_id)
82
+
83
+ train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
84
+
85
+ else:
86
+ random.seed(42)
87
+ random.shuffle(train_data)
88
+
89
+ ICL = get_ICL(train_data, top_k=50)
90
+ # print(ICL)
91
+ input = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"
92
+
93
+ try:
94
+ text, tokens, logprobs = gpt_4o_useful(input)
95
+ except Exception as e:
96
+ print(e)
97
+ tokens = None
98
+ logprobs = None
99
+
100
+ return tokens, logprobs
101
+
102
+ def extract_columns_to_dict(file_path, delimiter='\t'):
103
+
104
+ data_dict = {}
105
+
106
+ with open(file_path, mode='r', encoding='utf-8') as file:
107
+ reader = csv.reader(file, delimiter=delimiter)
108
+ for row in reader:
109
+ if len(row) >= 4:
110
+ key = row[2]
111
+ value = row[3]
112
+ data_dict[key] = value
113
+
114
+ return data_dict
115
+
116
+
117
+ def parse_date(date_str):
118
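+ # Returns (parsed datetime or None, original string); handles exact dates, relative phrases, years, month-year, quarters, and seasons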
+ try:
119
+ return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
120
+ except ValueError:
121
+ match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
122
+ if match:
123
+ reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
124
+ if "Last month" in match.group(1):
125
+ return reference - datetime.timedelta(days=30), date_str
126
+ elif "Yesterday" in match.group(1):
127
+ return reference - datetime.timedelta(days=1), date_str
128
+ elif "Last week" in match.group(1):
129
+ return reference - datetime.timedelta(days=7), date_str
130
+ elif "This week" in match.group(1):
131
+ return reference, date_str
132
+
133
+ # Handle other date formats
134
+ match = re.fullmatch(r'\d{4}', date_str) # Year only, e.g. '2014'
135
+ if match:
136
+ return datetime.datetime(int(date_str), 1, 1), date_str
137
+
138
+ match = re.fullmatch(r'(\w+) (\d{4})', date_str) # Month and year, e.g. 'November 2023'
139
+ if match:
140
+ try:
141
+ return datetime.datetime.strptime(date_str, "%B %Y"), date_str
142
+ except ValueError:
143
+ return None, date_str
144
+
145
+ match = re.fullmatch(r'(\d{4})-Q(\d)', date_str) # Quarter, e.g. '2024-Q1'
146
+ if match:
147
+ year, quarter = int(match.group(1)), int(match.group(2))
148
+ month = (quarter - 1) * 3 + 1
149
+ return datetime.datetime(year, month, 1), date_str
150
+
151
+ match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE) # Season, e.g. '2023 Autumn' or '2023 Fall'
152
+ if match:
153
+ year = int(match.group(1))
154
+ season_map = {"Spring": 3, "Summer": 6, "Autumn": 9, "Fall": 9, "Winter": 12}
155
+ month = season_map[match.group(2).capitalize()]
156
+ return datetime.datetime(year, month, 1), date_str
157
+
158
+ return None, date_str
159
+
160
+ def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):
161
+
162
+ events = []
163
+
164
+ # url_path = os.path.join(data_dir, "augmented_search_results.tsv")
165
+ # url_query_dict = extract_columns_to_dict(file_path=url_path, delimiter='\t')
166
+
167
+ pledge = claim.strip()
168
+
169
+ file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
170
+ gpt4_results_json = load_json(file_path)
171
+
172
+ print(gpt4_results_json)
173
+ train_file_path = hf_hub_download(
174
+ repo_id="PledgeTracker/demo_feedback",
175
+ filename="train_useful.json",
176
+ repo_type="dataset",
177
+ token=os.environ["HF_TOKEN"]
178
+ )
179
+
180
+ with open(train_file_path, "r", encoding="utf-8") as f:
181
+ train_data = json.load(f)
182
+ print(train_data[0])
183
+
184
+ instruction = open(f"system/instruction.txt", "r").read()
185
+
186
+ map_file_path = hf_hub_download(
187
+ repo_id="PledgeTracker/demo_feedback",
188
+ filename="mapping.txt",
189
+ repo_type="dataset",
190
+ token=os.environ["HF_TOKEN"]
191
+ )
192
+ mapping_f = open(map_file_path, "r").readlines()
193
+ mapping = {}
194
+
195
+ for map_id, line in enumerate(mapping_f):
196
+ mapping[map_id] = int(line.strip())
197
+
198
+ ICL_id = None
199
+ if suggestion_meta:
200
+ try:
201
+ idx = int(suggestion_meta["index"])
202
+ ICL_id = mapping.get(idx)
203
+ print(f"[Suggestion] index: {idx} → pledge_id: {ICL_id}")
204
+ except Exception as e:
205
+ print(f"[Mapping error]: {e}")
206
+
207
+ for doc in gpt4_results_json:
208
+ meta_date = doc["date"]
209
+ for event in doc.get("output", {}).get("events", []):
210
+ parsed_date, original_date = parse_date(event["date"])
211
+ if parsed_date:
212
+ if meta_date != original_date:
213
+ event_date_and_pub_date = original_date + f" ({meta_date})"
214
+ else:
215
+ event_date_and_pub_date = original_date
216
+
217
+ test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful?"
218
+
219
+ print(test_instance)
220
+
221
+ label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
222
+
223
+ URL = doc["url"]
224
+ events.append({
225
+ "date": original_date,
226
+ "event date (publication date if different)": event_date_and_pub_date,
227
+ "event": event["event"],
228
+ "url": URL,
229
+ "label": label,
230
+ "confident": score
231
+ })
232
+
233
+ # Sort by date, most recent first
234
+ events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
235
+ return events
236
+
237
+
238
+
system/scraper.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ import os
3
+ import csv
4
+ import json
5
+ import fitz
6
+ import time
7
+ import requests
8
+ import pandas as pd
9
+ from time import sleep
10
+ from pathlib import Path
11
+ from system.html2lines import url2lines, line_correction, html2metadata
12
+
13
+ MAX_RETRIES = 3
14
+ TIMEOUT = 5 # seconds
15
+
16
+
17
+ def scrape_text_from_url(url, temp_name):
18
+ response = None
19
+ for attempt in range(MAX_RETRIES):
20
+ try:
21
+ response = requests.get(url, timeout=TIMEOUT)
22
+ break
23
+ except requests.RequestException:
24
+ if attempt < MAX_RETRIES - 1:
25
+ sleep(3)
26
+
27
+ if response is None or response.status_code == 503:
28
+ return []
29
+
30
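+ # For PDF URLs, save the file and extract text page by page with PyMuPDF (fitz)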
+ if url.endswith(".pdf"):
31
+ pdf_dir = Path("/tmp/pdf_dir")
32
+ pdf_dir.mkdir(parents=True, exist_ok=True)
33
+ pdf_path = pdf_dir / f"{temp_name}.pdf"
34
+ with open(pdf_path, "wb") as f:
35
+ f.write(response.content)
36
+
37
+ extracted_text = ""
38
+ doc = fitz.open(str(pdf_path))
39
+ for page in doc:
40
+ extracted_text += page.get_text() or ""
41
+
42
+ return line_correction(extracted_text.split("\n"))
43
+
44
+ return line_correction(url2lines(url))
45
+
46
+ def process_row(row, claim_id):
47
+ try:
48
+ url = row[2]
49
+ json_data = {
50
+ "claim_id": claim_id,
51
+ "type": row[1],
52
+ "query": row[3],
53
+ "url": url,
54
+ "url2text": scrape_text_from_url(url, claim_id),
55
+ "metadata": {}
56
+ }
57
+ meta = html2metadata(url)
58
+ json_data["metadata"] = {
59
+ "title": meta.get("title"),
60
+ "date": meta.get("date")
61
+ }
62
+ return json_data
63
+ except Exception as e:
64
+ print(f"[WARN] Failed to scrape {row[2]}: {e}")
65
+ return None
66
+
67
+ def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
68
+ claim_id = Path(tsv_file_path).stem
69
+ output_jsonl_path = Path(output_jsonl_path)
70
+ output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)
71
+
72
+ if output_jsonl_path.exists():
73
+ print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
74
+ return str(output_jsonl_path)
75
+
76
+ try:
77
+ df = pd.read_csv(tsv_file_path, sep="\t", header=None)
78
+ print("[INFO] Data loaded successfully with Pandas.")
79
+ except Exception as e:
80
+ raise RuntimeError(f"[ERROR] Failed to load TSV: {e}")
81
+
82
+ results = []
83
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
84
+ futures = [executor.submit(process_row, row, claim_id) for _, row in df.iterrows()]
85
+ for future in as_completed(futures):
86
+ result = future.result()
87
+ if result:
88
+ results.append(result)
89
+
90
+ with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
91
+ for item in results:
92
+ json_file.write(json.dumps(item, ensure_ascii=False) + "\n")
93
+
94
+ print(f"[SYSTEM] Output saved to {output_jsonl_path}")
95
+ return str(output_jsonl_path)
system/test.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ import json
3
+ import os
4
+
5
+ file_path = hf_hub_download(
6
+ repo_id="PledgeTracker/demo_feedback", # 你的私密 dataset 名
7
+ filename="train_useful.json", # 你上传的文件名
8
+ repo_type="dataset", # 必须设置为 dataset 类型
9
+ token=os.environ["HF_TOKEN"] # 需要 HF token 才能访问私密文件
10
+ )
11
+
12
+ with open(file_path, "r", encoding="utf-8") as f:
13
+ train_data = json.load(f)
14
+
15
+ print(train_data[0])
test.html ADDED
@@ -0,0 +1,513 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <title>Pledge Tracker – Demo</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ </head>
9
+ <body class="bg-gray-50 text-gray-800">
10
+ <header class="bg-white shadow py-4 sticky top-0 z-10">
11
+ <div class="container mx-auto flex items-center justify-between px-4">
12
+ <div class="flex items-center gap-2">
13
+ <span class="text-2xl font-bold text-purple-600">🤗</span>
14
+ <span class="font-semibold text-lg">Pledge Tracking</span>
15
+ </div>
16
+ <nav class="hidden md:flex gap-6 font-medium">
17
+ <a class="hover:text-purple-600" href="#eval-response">Track Your Pledge</a>
18
+ <a class="hover:text-purple-600" href="#about">About</a>
19
+ </nav>
20
+ </div>
21
+ </header>
22
+
23
+ <section class="py-16 bg-gradient-to-r from-purple-50 to-purple-50 text-center">
24
+ <div class="container mx-auto px-4 max-w-2xl">
25
+ <h1 class="text-3xl md:text-4xl font-extrabold mb-4">
26
+ Fact-Checking Election Promises
27
+ </h1>
28
+ <p class="text-lg text-gray-600">
29
+ Track progress towards fulfilling the promise.
30
+ </p>
31
+ </div>
32
+ </section>
33
+
34
+ <section id="eval-response" class="py-12">
35
+ <div class="container mx-auto px-4 max-w-4xl">
36
+ <!-- <h2 class="text-2xl font-bold mb-6">Track Manifesto Pledge</h2> -->
37
+ <label for="claim" class="block text-sm font-medium mb-2">
38
+ Please enter the pledge:
39
+ </label>
40
+ <textarea
41
+ id="claim"
42
+ class="w-full border rounded-lg p-3 h-40 focus:outline-none focus:ring-2 focus:ring-purple-500"
43
+ placeholder="For example: 'We will support families with children by introducing free breakfast clubs in every primary school...'"
44
+ ></textarea>
45
+
46
+ <div id="similar-suggestions" class="mt-3 text-sm text-gray-600 hidden"></div>
47
+
48
+ <div class="mt-4">
49
+ <label for="pledge-date" class="block text-sm font-medium mb-2">
50
+ When was this pledge made?
51
+ </label>
52
+ <div class="grid grid-cols-[1fr_auto] items-center gap-2">
53
+ <input
54
+ type="date"
55
+ id="pledge-date"
56
+ class="w-full border rounded-lg p-2"
57
+ />
58
+ <button
59
+ onclick="setDefaultDate()"
60
+ type="button"
61
+ class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
62
+ >
63
+ Use default: 4th Jul 2024
64
+ </button>
65
+ </div>
66
+ <div id="date-warning" class="text-sm text-red-600 mt-1 hidden">
67
+ Please select a date or click the button to use the default.
68
+ </div>
69
+ </div>
70
+
71
+ <div class="mt-4">
72
+ <label for="pledge-author" class="block text-sm font-medium mb-2">
73
+ Who made this pledge?
74
+ </label>
75
+ <div class="grid grid-cols-[1fr_auto] items-center gap-2">
76
+ <input
77
+ type="text"
78
+ id="pledge-author"
79
+ class="w-full border rounded-lg p-2"
80
+ placeholder="Enter the name of the party or person"
81
+ />
82
+ <button
83
+ onclick="setDefaultAuthor()"
84
+ type="button"
85
+ class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
86
+ >
87
+ Use default: Labour
88
+ </button>
89
+ </div>
90
+ <div id="author-warning" class="text-sm text-red-600 mt-1 hidden">
91
+ Please enter a speaker or click the button to use the default.
92
+ </div>
93
+ </div>
94
+
95
+
96
+
97
+ <label for="time-range" class="block text-sm font-medium mt-4 mb-2">
98
+ Please select a time range:
99
+ </label>
100
+ <select id="time-range" class="w-full border rounded-lg p-2">
101
+ <option value="week">Past one week</option>
102
+ <option value="month">Past one month</option>
103
+ <!-- <option value="year">From when the pledge was made</option> -->
104
+ <option value="since_pledge_date">From when the pledge was made</option>
105
+ </select>
106
+
107
+ <button
108
+ id="check"
109
+ class="mt-4 px-6 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
110
+ >
111
+ Let's fact check!
112
+ </button>
113
+
114
+ <div id="progress" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
115
+ <h3 class="font-semibold mb-2">System Progress</h3>
116
+ <div id="status" class="text-sm text-gray-800 font-normal leading-relaxed"></div>
117
+ </div>
118
+
119
+
120
+ <div id="result" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
121
+ <h3 class="font-semibold mb-2">Result</h3>
122
+ <p class="text-gray-700"></p>
123
+ </div>
124
+ </div>
125
+ </section>
126
+
127
+ <section id="about" class="py-12">
128
+ <div class="container mx-auto px-4 max-w-4xl">
129
+ <h2 class="text-2xl font-bold mb-6">About</h2>
130
+ <p class="text-gray-700 leading-relaxed">
131
+ This demo connects a static front-end with a Python back-end using Flask.
132
+ The back-end generates event data and returns structured events related
133
+ to a manifesto pledge.
134
+ </p>
135
+ </div>
136
+ </section>
137
+
138
+
139
+
140
+
141
+ <script>
142
+ let suggestedPledge = null;
143
+ let currentAbortController = null;
144
+ const feedbackData = {};
145
+ let lastUsedFile = null;
146
+ let lastUserId = null;
147
+ let lastTimestamp = null;
148
+ const checkBtn = document.getElementById("check");
149
+
150
+ const stepListStandard = {
151
+ 1: "Retrieving evidence related to the pledge",
152
+ 2: "Scraping documents from URLs",
153
+ 3: "Generating more queries based on the retrieved evidence",
154
+ 4: "Searching more articles",
155
+ 5: "Scraping documents from URLs",
156
+ 6: "Finding the most relevant documents",
157
+ 7: "Extracting events from top documents",
158
+ 8: "Sorting events temporally"
159
+ };
160
+
161
+ const stepListSuggestion = {
162
+ 1: "Retrieving evidence based on genertaed queries",
163
+ 2: "Scraping documents from URLs",
164
+ 3: "Finding the most relevant documents",
165
+ 4: "Extracting events from top documents",
166
+ 5: "Sorting events temporally"
167
+ };
168
+
169
+ let stepList = stepListStandard;
170
+
171
+ function renderStatus(statusDict) {
172
+ let html = "<ul class='list-disc ml-6 space-y-1 text-sm'>";
173
+ for (let step in stepList) {
174
+ const content = statusDict?.[step] || stepList[step];
175
+ const prefix = statusDict?.[step] ? "✅" : "⏳";
176
+ html += `<li>${prefix} Step ${step}: ${content}</li>`;
177
+ }
178
+ html += "</ul>";
179
+ return html;
180
+ }
181
+
182
+ function setDefaultDate() {
183
+ const input = document.getElementById("pledge-date");
184
+ input.value = "2024-07-04";
185
+ document.getElementById("date-warning").classList.add("hidden");
186
+ }
187
+
188
+ function setDefaultAuthor() {
189
+ const input = document.getElementById("pledge-author");
190
+ input.value = "Labour";
191
+ document.getElementById("author-warning").classList.add("hidden");
192
+ }
193
+
194
+ // function setFeedback(index, answer) {
195
+ // feedbackData[index] = answer;
196
+ // const message = document.getElementById(`msg-${index}`);
197
+ // message.textContent = `✓ Selected: ${answer ? 'Yes' : 'No'}`;
198
+ // message.className = answer
199
+ // ? "text-sm text-green-600 mt-1"
200
+ // : "text-sm text-red-600 mt-1";
201
+ // }
202
+ function setFeedback(index, answer) {
203
+ feedbackData[index] = answer;
204
+ const message = document.getElementById(`msg-${index}`);
205
+
206
+ let displayText = "";
207
+ let colorClass = "";
208
+
209
+ switch(answer) {
210
+ case "not_relevant":
211
+ displayText = "Not relevant";
212
+ colorClass = "text-red-300";
213
+ break;
214
+ case "relevant_seen":
215
+ displayText = "Relevant but already seen";
216
+ colorClass = "text-grey-400";
217
+ break;
218
+ case "relevant_updated":
219
+ displayText = "Relevant and up-to-date";
220
+ colorClass = "text-blue-400";
221
+ break;
222
+ }
223
+
224
+ message.textContent = `✓ Selected: ${displayText}`;
225
+ message.className = `text-sm ${colorClass} mt-1`;
226
+ }
227
+
228
+ function pollStatus(userId, timestamp, statusElement) {
229
+ if (window.pollIntervalId) {
230
+ clearInterval(window.pollIntervalId);
231
+ }
232
+
233
+ window.pollIntervalId = setInterval(async () => {
234
+ try {
235
+ const res = await fetch(`/api/status?user_id=${userId}&timestamp=${timestamp}&_=${Date.now()}`);
236
+ const data = await res.json();
237
+
238
+ // dynamically render the structured step status
239
+ if (data.status) {
240
+ statusElement.innerHTML = renderStatus(data.status);
241
+ }
242
+
243
+ // check whether the run has finished
244
+ const values = Object.values(data.status || {});
245
+ const finalText = values.join(" ").toLowerCase();
246
+
247
+ if (finalText.includes("done") || finalText.includes("finished")) {
248
+ clearInterval(window.pollIntervalId);
249
+ window.pollIntervalId = null;
250
+ statusElement.innerHTML += `<div class="mt-2 text-green-600 font-semibold">✅ All done.</div>`;
251
+ checkBtn.disabled = false;
252
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
253
+ if (lastUsedFile) loadEvents(lastUsedFile);
254
+ } else if (finalText.includes("error") || finalText.includes("fail")) {
255
+ clearInterval(window.pollIntervalId);
256
+ window.pollIntervalId = null;
257
+ statusElement.innerHTML += `<div class="mt-2 text-red-600 font-semibold">❌ The process failed.</div>`;
258
+ checkBtn.disabled = false;
259
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
260
+ }
261
+ } catch (err) {
262
+ clearInterval(window.pollIntervalId);
263
+ window.pollIntervalId = null;
264
+ statusElement.innerHTML = `<div class="text-red-600">❌ Failed to check status: ${err.message}</div>`;
265
+ }
266
+ }, 2000);
267
+ }
268
+
269
+
270
+
271
+ async function submitAllFeedback() {
272
+ const entries = Object.entries(feedbackData);
273
+ if (entries.length === 0) {
274
+ alert("No feedback to submit!");
275
+ return;
276
+ }
277
+ const confirmed = confirm("Submit all feedback?");
278
+ if (!confirmed) return;
279
+
280
+ const pledgeText = document.getElementById("claim").value.trim();
281
+
282
+ const res = await fetch('/api/feedback', {
283
+ method: 'POST',
284
+ headers: { 'Content-Type': 'application/json' },
285
+ body: JSON.stringify({
286
+ pledge: pledgeText,
287
+ file: lastUsedFile,
288
+ user_id: lastUserId,
289
+ timestamp: lastTimestamp,
290
+ feedback: entries.map(([index, answer]) => ({
291
+ eventIndex: index,
292
+ answer: answer
293
+ }))
294
+ })
295
+ });
296
+
297
+ alert(res.ok ? "✅ Feedback submitted successfully!" : "❌ Submission failed.");
298
+ }
299
+
300
+ async function loadEvents(file) {
301
+ const resultBox = document.getElementById("result");
302
+ const p = resultBox.querySelector("p");
303
+ resultBox.classList.remove("hidden");
304
+
305
+ try {
306
+ const fileParam = encodeURIComponent(file);
307
+ const eventsRes = await fetch(`/api/events?file=${fileParam}`);
308
+ if (!eventsRes.ok) throw new Error("Event file not found or malformed");
309
+ const data = await eventsRes.json();
310
+ if (!Array.isArray(data)) throw new Error("Unexpected data format");
311
+
312
+ p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
313
+ data.map((e, index) => `
314
+ <div class="mb-6 border-b pb-4">
315
+ 🗓️ <b>${e.date}</b>: ${e.event}<br>
316
+ 🔗 <a href="${e.url}" target="_blank" class="text-purple-400 underline">Source</a>
317
+
318
+ <div class="mt-3">
319
+ <label class="block text-sm font-medium mb-2">How relevant is this event?</label>
320
+ <div class="flex flex-wrap gap-2">
321
+ <button onclick="setFeedback(${index}, 'not_relevant')"
322
+ class="px-3 py-1.5 bg-gray-100 hover:bg-gray-200 border border-gray-300 rounded-lg text-gray-700">
323
+ Not relevant
324
+ </button>
325
+ <button onclick="setFeedback(${index}, 'relevant_seen')"
326
+ class="px-3 py-1.5 bg-blue-100 hover:bg-blue-200 border border-blue-300 rounded-lg text-blue-700">
327
+ Relevant but seen
328
+ </button>
329
+ <button onclick="setFeedback(${index}, 'relevant_updated')"
330
+ class="px-3 py-1.5 bg-green-100 hover:bg-green-200 border border-green-300 rounded-lg text-green-700">
331
+ Relevant & up-to-date
332
+ </button>
333
+ </div>
334
+ <div id="msg-${index}" class="text-sm mt-1"></div>
335
+ </div>
336
+ </div>
337
+ `).join('') +
338
+ `<button onclick="submitAllFeedback()" class="mt-6 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
339
+ 📤 Submit All Feedback
340
+ </button>
341
+ <button onclick="window.location.href='/download?file=${fileParam}'" class="mt-4 ml-4 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
342
+ 📅 Download Excel
343
+ </button>`;
344
+ } catch (err) {
345
+ p.textContent = `❌ Failed to load timeline: ${err.message}`;
346
+ }
347
+ }
348
+
349
+ let suggestTimer = null;
350
+ document.getElementById("claim").addEventListener("input", () => {
351
+ clearTimeout(suggestTimer);
352
+ suggestTimer = setTimeout(fetchSuggestions, 300); // 300ms delay to avoid flooding
353
+ });
354
+
355
+ async function fetchSuggestions() {
356
+ const claimText = document.getElementById("claim").value.trim();
357
+ const suggestionBox = document.getElementById("similar-suggestions");
358
+
359
+ if (!claimText) {
360
+ suggestionBox.classList.add("hidden");
361
+ return;
362
+ }
363
+
364
+ const res = await fetch("/api/similar-pledges", {
365
+ method: "POST",
366
+ headers: { "Content-Type": "application/json" },
367
+ body: JSON.stringify({ claim: claimText })
368
+ });
369
+ const data = await res.json();
370
+ const suggestions = data.suggestions || [];
371
+
372
+ if (suggestions.length === 0) {
373
+ suggestionBox.classList.add("hidden");
374
+ } else {
375
+ const author = "Labour";
376
+ const date = "2024-07-04";
377
+ suggestionBox.innerHTML =
378
+ "<div class='font-semibold mb-1'>💡 Are you fact-checking ... </div>" +
379
+ "<ul class='list-disc ml-6 mt-1'>" +
380
+ suggestions.map(s => `
381
+ <li class="mb-2">
382
+ ${author}: ${s.text} (${date})
383
+ <button
384
+ onclick="useSuggestedPledge('${s.text.replace(/'/g, "\\'")}', ${s.index})"
385
+ class="ml-2 px-2 py-1 text-xs bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500">
386
+ Fact-check this pledge
387
+ </button>
388
+ </li>
389
+ `).join("") +
390
+ "</ul>";
391
+ suggestionBox.classList.remove("hidden");
392
+ }
393
+ }
394
+
395
+
396
+ checkBtn.addEventListener("click", async () => {
397
+ const claim = document.getElementById("claim").value.trim();
398
+ const pledgeDate = document.getElementById("pledge-date").value.trim();
399
+ const pledgeAuthor = document.getElementById("pledge-author").value.trim();
400
+ const statusElement = document.getElementById("status");
401
+ const resultBox = document.getElementById("result");
402
+ // resultBox.classList.remove("hidden");
403
+ const p = resultBox.querySelector("p");
404
+
405
+
406
+
407
+ let valid = true;
408
+ if (!claim) {
409
+ alert("Please enter the pledge text.");
410
+ valid = false;
411
+ }
412
+ if (!pledgeDate) {
413
+ document.getElementById("date-warning").classList.remove("hidden");
414
+ valid = false;
415
+ }
416
+ if (!pledgeAuthor) {
417
+ document.getElementById("author-warning").classList.remove("hidden");
418
+ valid = false;
419
+ }
420
+
421
+ if (!valid) return;
422
+
423
+ checkBtn.disabled = true;
424
+ checkBtn.classList.add("opacity-50", "cursor-not-allowed");
425
+
426
+ // document.getElementById("status").classList.remove("hidden");
427
+ statusElement.innerHTML = renderStatus({});
428
+ document.getElementById("result").classList.remove("hidden");
429
+ document.getElementById("progress").classList.remove("hidden");
430
+
431
+
432
+
433
+ try {
434
+ const timeRange = document.getElementById("time-range").value;
435
+ const pledgeDate = document.getElementById("pledge-date").value;
436
+ const pledgeAuthor = document.getElementById("pledge-author").value;
437
+ if (currentAbortController) currentAbortController.abort();
438
+ currentAbortController = new AbortController();
439
+ const signal = currentAbortController.signal;
440
+ let valid = true;
441
+
442
+ stepList = (suggestedPledge !== null) ? stepListSuggestion : stepListStandard;
443
+
444
+ if (!pledgeDate) {
445
+ document.getElementById("date-warning").classList.remove("hidden");
446
+ valid = false;
447
+ }
448
+ if (!pledgeAuthor) {
449
+ document.getElementById("author-warning").classList.remove("hidden");
450
+ valid = false;
451
+ }
452
+ if (!valid) return;
453
+
454
+ const userId = Math.random().toString(36).substring(2, 10);
455
+ const now = new Date();
456
+ const timestamp = now.toISOString().replace(/[:.]/g, "-").slice(0, 19);
457
+ statusElement.textContent = "";
458
+ // pollStatus(userId, timestamp, p);
459
+ pollStatus(userId, timestamp, document.getElementById("status"));
460
+
461
+
462
+ const runRes = await fetch("/api/run-model", {
463
+ method: "POST",
464
+ headers: { "Content-Type": "application/json" },
465
+ body: JSON.stringify({
466
+ claim,
467
+ time_range: timeRange,
468
+ pledge_date: pledgeDate,
469
+ pledge_author: pledgeAuthor,
470
+ user_id: userId,
471
+ timestamp: timestamp,
472
+ suggestion_meta: suggestedPledge
473
+ }),
474
+ signal: signal // the abort signal belongs in the fetch options, not the JSON body
475
+ });
476
+
477
+ const runData = await runRes.json();
478
+
479
+ lastUsedFile = runData.file;
480
+ lastUserId = runData.user_id;
481
+ lastTimestamp = runData.timestamp;
482
+ } catch (err) {
483
+ if (err.name === "AbortError") {
484
+ console.log("Previous request aborted.");
485
+ checkBtn.disabled = false;
486
+ checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
487
+ return;
488
+ }
489
+ p.textContent = `❌ Failed to load timeline: ${err.message}`;
490
+ }
491
+
492
+ });
493
+
494
+
495
+ async function useSuggestedPledge(text, index) {
496
+ document.getElementById("claim").value = text;
497
+ document.getElementById("pledge-author").value = "Labour";
498
+ document.getElementById("pledge-date").value = "2024-07-04";
499
+ suggestedPledge = { text, index };
500
+ alert("✅ This pledge has been filled in. You can now click 'Let's fact check!'");
501
+ await fetch("/api/log-similar-selection", {
502
+ method: "POST",
503
+ headers: { "Content-Type": "application/json" },
504
+ body: JSON.stringify({
505
+ selected_text: text,
506
+ index: index
507
+ })
508
+ });
509
+ }
510
+
511
+ </script>
512
+ </body>
513
+ </html>
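The front-end's pollStatus function expects GET /api/status?user_id=...&timestamp=... to return JSON of the form {"status": {"1": "...", "2": "...", ...}}, keyed by step number; it treats any step message containing "done" or "finished" as completion, and "error" or "fail" as failure. A minimal back-end sketch of a status writer compatible with that contract (the helper name and the tmp/status path are assumptions for illustration, not the app's actual implementation):

# Hedged sketch: persist per-step progress in the shape the front-end polls for.
# write_step_status and the tmp/status directory are illustrative assumptions.
import json
from pathlib import Path

def write_step_status(user_id: str, timestamp: str, step: int, message: str,
                      status_dir: str = "tmp/status") -> None:
    """Record that `step` reached `message` for one (user_id, timestamp) run."""
    path = Path(status_dir) / f"{user_id}_{timestamp}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    status = json.loads(path.read_text()) if path.exists() else {}
    status[str(step)] = message  # step numbers become string keys, as the UI expects
    path.write_text(json.dumps(status, ensure_ascii=False))

# Example: mark the last step as finished so the polling loop stops.
write_step_status("abc12345", "2024-07-04T10-00-00", 8, "Sorting events temporally (done)")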