Commit 35b3f62
1 Parent(s): 10e50a5
add

Files changed:
- Dockerfile +10 -0
- README copy.md +13 -0
- app.py +360 -5
- requirements.txt +25 -0
- system/.DS_Store +0 -0
- system/__init__.py +0 -0
- system/__pycache__/augmented_searching.cpython-312.pyc +0 -0
- system/__pycache__/ee.cpython-312.pyc +0 -0
- system/__pycache__/generate_output.cpython-312.pyc +0 -0
- system/__pycache__/hero_pipeline.cpython-312.pyc +0 -0
- system/__pycache__/html2lines.cpython-312.pyc +0 -0
- system/__pycache__/initial_searching.cpython-312.pyc +0 -0
- system/__pycache__/process_time.cpython-312.pyc +0 -0
- system/__pycache__/scraper.cpython-312.pyc +0 -0
- system/augmented_searching.py +101 -0
- system/baseline/hyde_fc_generation_optimized.py +163 -0
- system/baseline/question_generation_optimized.py +244 -0
- system/baseline/reranking_optimized.py +230 -0
- system/baseline/retrieval_optimized.py +244 -0
- system/baseline/train.json +0 -0
- system/ee.py +98 -0
- system/existing_pledges.txt +54 -0
- system/generate_output.py +75 -0
- system/hero_QA.py +60 -0
- system/hero_pipeline.py +157 -0
- system/html2lines.py +105 -0
- system/icl.txt +69 -0
- system/initial_searching.py +101 -0
- system/instruction.txt +7 -0
- system/pledge_tracking.py +219 -0
- system/process_time.py +238 -0
- system/scraper.py +95 -0
- system/test.py +15 -0
- test.html +513 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM python:3.10-slim

WORKDIR /app
COPY . /app

RUN pip install --no-cache-dir flask flask-cors pandas openpyxl

EXPOSE 7860

CMD ["python", "app.py"]
README copy.md
ADDED
@@ -0,0 +1,13 @@
---
title: Demo
emoji: 🔥
colorFrom: yellow
colorTo: blue
sdk: gradio
sdk_version: 5.32.1
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,7 +1,362 @@
Removed:
-import
-    return "Hello " + name + "!!"

Added:
from flask import Flask, jsonify, send_file, request, send_from_directory
from flask_cors import CORS
import os, json, uuid, time
import pandas as pd
from datetime import datetime, timedelta
from huggingface_hub import HfApi
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from system.pledge_tracking import run_pipeline
from huggingface_hub import hf_hub_download
import spacy
import traceback
import threading

nlp = spacy.load("en_core_web_sm")

app = Flask(__name__, static_folder='.')
CORS(app)

HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")
TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)

REFERENCE_PLEDGES = []

REFERENCE_PLEDGE_PATH = hf_hub_download(
    repo_id="PledgeTracker/demo_feedback",
    filename="existing_pledges.txt",
    repo_type="dataset",
    token=os.environ["HF_TOKEN"]
)

if os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")


def lemmatize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])


@app.route("/api/similar-pledges", methods=["POST"])
def similar_pledges():
    data = request.get_json()
    claim = data.get("claim", "").strip()
    if not claim or not REFERENCE_PLEDGES:
        return jsonify({"suggestions": []})

    all_pledges = [claim] + REFERENCE_PLEDGES
    lemmatized_pledges = [lemmatize(p) for p in all_pledges]

    vectorizer = TfidfVectorizer().fit_transform(lemmatized_pledges)
    similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
    filtered = [(i, similarities[i]) for i in range(len(similarities)) if similarities[i] > 0.3]
    top_filtered = sorted(filtered, key=lambda x: x[1], reverse=True)[:5]

    suggestions = [
        {"text": REFERENCE_PLEDGES[i], "index": int(i)}
        for i, score in top_filtered
    ]

    return jsonify({"suggestions": suggestions})


def calculate_time_range(option: str, pledge_date: str = None):
    today = datetime.today()

    if option == "week":
        one_week_ago = today - timedelta(days=7)
        start = max(one_week_ago, pledge_date)
    elif option == "month":
        one_month_ago = today - timedelta(days=30)
        start = max(one_month_ago, pledge_date)
    elif option == "year":
        one_year_ago = today - timedelta(days=365)
        start = max(one_year_ago, pledge_date)
    elif option == "since_pledge_date":
        if not pledge_date:
            raise ValueError("Pledge date is required for 'since_pledge_date' option")
        start = datetime.strptime(pledge_date, "%Y-%m-%d")
    else:
        raise ValueError("Invalid time range option")
    print(start)
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")

@app.route("/")
def serve_html():
    return send_from_directory('.', 'test.html')

@app.route("/api/status")
def check_status():
    user_id = request.args.get("user_id")
    timestamp = request.args.get("timestamp")
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
    if not os.path.exists(log_file_path):
        return jsonify({"status": {}}), 200
    try:
        with open(log_file_path, "r") as f:
            status = json.load(f)
    except Exception:
        status = {}

    return jsonify({"status": status})


@app.route("/api/run-model", methods=["POST"])
def run_model():
    data = request.get_json()
    claim = data.get("claim", "no input")
    time_range_option = data.get("time_range", "month")

    suggestion_meta = data.get("suggestion_meta")
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")
    timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
    user_id = data.get("user_id") or str(uuid.uuid4())[:8]

    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")

    status_lock = threading.Lock()

    def update_status(step_id, msg):
        print(f"[STATUS] Step {step_id}: {msg}")
        with status_lock:
            if os.path.exists(log_file_path):
                try:
                    with open(log_file_path, "r") as f:
                        current = json.load(f)
                except Exception:
                    current = {}
            else:
                current = {}
            current[str(step_id)] += f": {msg}"
            with open(log_file_path, "w") as f:
                json.dump(current, f, indent=2)

    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Time range: {pledge_date}")

        # user_id = str(uuid.uuid4())[:8]
        # outputs = run_pipeline(claim, pledge_date, pledge_author, time_start, timestamp, user_id)


        update_status(0, "📌 Starting the system ...")
        print(suggestion_meta)

        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )

        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)

        events = df.to_dict(orient="records")
        log_entry = {
            "requested_time": timestamp,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "user_id": user_id,
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"

        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))

        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            update_status(7, "✅ done")

        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")

        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500

@app.route("/api/events")
def get_events():
    filename = request.args.get("file")
    file_path = os.path.join(TMP_DIR, filename)

    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404

    with open(file_path, "r") as f:
        events = json.load(f)

    return jsonify(events)


@app.route("/api/feedback", methods=["POST"])
def receive_feedback():
    data = request.get_json()
    pledge = data.get("pledge", "no_pledge_text")
    feedback_list = data.get("feedback", [])
    filename = data.get("file")
    file_path = os.path.join(TMP_DIR, filename)
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")

    if not os.path.exists(file_path):
        return jsonify({"error": "Event file not found"}), 400

    with open(file_path, "r") as f:
        events = json.load(f)

    # Store the feedback answer strings directly
    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}

    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)

    log_entry = {
        "requested_time": data.get("timestamp"),
        "user_id": data.get("user_id"),
        "pledge": pledge,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }

    timestamp = data.get("timestamp")
    user_id = data.get("user_id")

    if not user_id or not timestamp:
        return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400

    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"

    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))

    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500

    return jsonify({'status': 'success'})

# @app.route("/api/feedback", methods=["POST"])
# def receive_feedback():
#     data = request.get_json()
#     pledge = data.get("pledge", "no_pledge_text")
#     feedback_list = data.get("feedback", [])
#     filename = data.get("file")
#     file_path = os.path.join(TMP_DIR, filename)
#     pledge_date = data.get("pledge_date", "")
#     pledge_author = data.get("pledge_author", "")

#     if not os.path.exists(file_path):
#         return jsonify({"error": "Event file not found"}), 400

#     with open(file_path, "r") as f:
#         events = json.load(f)

#     feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}

#     for idx, event in enumerate(events):
#         event["user_feedback"] = feedback_dict.get(idx)

#     log_entry = {
#         "requested_time": data.get("timestamp"),
#         "user_id": data.get("user_id"),
#         "pledge": pledge,
#         "pledge_author": pledge_author,
#         "pledge_date": pledge_date,
#         "events": events
#     }

#     timestamp = data.get("timestamp")
#     user_id = data.get("user_id")
#     timestamp = data.get("timestamp")

#     if not user_id or not timestamp:
#         return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400

#     local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"

#     with open(local_filename, "w") as f:
#         f.write(json.dumps(log_entry, indent=1))

#     try:
#         api = HfApi()
#         api.upload_file(
#             path_or_fileobj=local_filename,
#             path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
#             repo_id=HF_DATASET_REPO,
#             repo_type="dataset",
#             token=HF_TOKEN
#         )
#     except Exception as e:
#         return jsonify({'status': 'partial_success', 'error': str(e)}), 500

#     return jsonify({'status': 'success'})

@app.route("/download-feedback/<filename>")
def download_feedback_file(filename):
    return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)

@app.route("/feedback-files")
def list_feedback_files():
    files = os.listdir(FEEDBACK_DIR)
    return jsonify(sorted(files))

@app.route("/download")
def download_excel():
    file = request.args.get("file")
    if not file:
        return "Missing file param", 400

    json_path = os.path.join(TMP_DIR, file)
    if not os.path.exists(json_path):
        return "Event file not found", 404

    with open(json_path, "r") as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
    df.to_excel(xlsx_path, index=False)

    return send_file(xlsx_path, as_attachment=True)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)
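For reference, a minimal client sketch for the two main endpoints defined above (/api/similar-pledges and /api/run-model). It assumes the app is running locally on port 7860 (the port used by app.run and the Dockerfile) and that the requests package is available; the claim text, dates, and author are illustrative placeholders, not values from this repository.

import requests

BASE = "http://localhost:7860"  # assumption: local run on the port exposed above

# Ask for reference pledges similar to a draft claim (illustrative claim text).
resp = requests.post(f"{BASE}/api/similar-pledges",
                     json={"claim": "We will build 40 new hospitals"})
print(resp.json())  # {"suggestions": [{"text": ..., "index": ...}, ...]}

# Kick off the pipeline; field names match what run_model() reads from the body.
# "since_pledge_date" is used because the week/month/year branches of
# calculate_time_range() call max() on a datetime and the raw pledge_date string.
payload = {
    "claim": "We will build 40 new hospitals",
    "time_range": "since_pledge_date",
    "pledge_date": "2024-01-01",
    "pledge_author": "Example author",
    "suggestion_meta": None,
}
resp = requests.post(f"{BASE}/api/run-model", json=payload)
print(resp.json())  # on success: {"status": "success", "file": ..., "user_id": ..., "timestamp": ...}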
requirements.txt
ADDED
@@ -0,0 +1,25 @@
flask
flask_cors
pandas
openpyxl
huggingface_hub
PyMuPDF==1.23.25
huggingface_hub==0.30.2
lxml==5.3.1
nltk==3.9.1
numpy==2.2.6
openai==1.84.0
pandas==2.3.0
rank_bm25==0.2.2
Requests==2.32.3
scikit_learn==1.7.0
sentence_transformers==3.3.1
spacy==3.8.2
tiktoken==0.7.0
torch==2.6.0
tqdm
trafilatura==2.0.0
transformers==4.51.3
vllm==0.8.4
accelerate
system/.DS_Store
ADDED
Binary file (8.2 kB)

system/__init__.py
ADDED
File without changes

system/__pycache__/augmented_searching.cpython-312.pyc
ADDED
Binary file (4.73 kB)

system/__pycache__/ee.cpython-312.pyc
ADDED
Binary file (4.71 kB)

system/__pycache__/generate_output.cpython-312.pyc
ADDED
Binary file (3.47 kB)

system/__pycache__/hero_pipeline.cpython-312.pyc
ADDED
Binary file (6.22 kB)

system/__pycache__/html2lines.cpython-312.pyc
ADDED
Binary file (3.15 kB)

system/__pycache__/initial_searching.cpython-312.pyc
ADDED
Binary file (5.25 kB)

system/__pycache__/process_time.cpython-312.pyc
ADDED
Binary file (8.95 kB)

system/__pycache__/scraper.cpython-312.pyc
ADDED
Binary file (4.76 kB)
system/augmented_searching.py
ADDED
@@ -0,0 +1,101 @@
import json
import os
import time
import requests
import pandas as pd
from datetime import datetime
from pathlib import Path
import spacy

def google_search(query, api_key, search_engine_id, start_date, end_date):
    print(f"[SYSTEM] Calling Google Search API for: {query}")
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search Failed: {e}")
        return []

def save_tsv(file_name, id_value, string_value, value_list, query):

    data = {
        'ID': id_value,
        'String': string_value,
        'ListValue': value_list,
        'query': query
    }
    df = pd.DataFrame(data)
    df.to_csv(file_name, sep='\t', index=False, header=False)

def ensure_directory_exists(path):
    dir_path = Path(path).expanduser().resolve().parent
    if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

def run_augmented_searching(qa_file, pipeline_base_dir, suggestion_meta, pledge_author, start_date, end_date, user_id, claim_id):
    if suggestion_meta==None:
        qa_lines = open(f"{qa_file}","r").read()
        qa_lines = json.loads(qa_lines)
        claim_text = f"{pledge_author}: {qa_lines['claim']}"
    else:
        # claim_text = suggestion_meta["text"]
        idx = suggestion_meta["index"]
        qa_lines = open(f"{qa_file}","r").readlines()[idx]
        qa_lines = json.loads(qa_lines)
        claim_text = f"{qa_lines['claim']}"
    print(qa_lines)


    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    # base_dir = pipeline_base_dir

    tsv_file_path = os.path.join(pipeline_base_dir, "augmented_search_results.tsv")
    ensure_directory_exists(tsv_file_path)


    urls = []
    string_values = []
    queries = []
    questions = []
    questions = [evidence["question"] for evidence in qa_lines["evidence"] if evidence["question"] not in questions]
    questions = questions[:10]


    results = google_search(claim_text, api_key, search_engine_id, start_date, end_date)
    print(results)
    for result in results:
        if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
            string_values.append("claim")
            urls.append(result["link"])
            queries.append(f"{pledge_author}: {claim_text}")

    for question in questions:
        results = google_search(f"{question}", api_key, search_engine_id, start_date, end_date)
        for result in results:
            if result["link"] not in urls and "fullfact.org/government-tracker" not in result["link"]:
                string_values.append("question")
                urls.append(result["link"])
                queries.append(f"{question}")

    urls = list(dict.fromkeys(urls))

    save_tsv(str(tsv_file_path), [0] * len(urls), string_values, urls, queries)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
    return str(tsv_file_path)
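The TSV written by save_tsv has no header row and four columns (ID, query type, URL, originating query). A small sketch of reading it back with pandas; the "outputs" base directory is an assumption consistent with the path check in ensure_directory_exists, and the column names are supplied here only for readability, they are not stored in the file.

import pandas as pd

# Read the header-less TSV produced by save_tsv(); names mirror the dict keys in save_tsv().
results = pd.read_csv(
    "outputs/augmented_search_results.tsv",   # assumption: pipeline_base_dir was "outputs"
    sep="\t",
    header=None,
    names=["ID", "String", "ListValue", "query"],
)

# Each row is one URL, tagged as coming from the "claim" query or from one of the questions.
print(results["String"].value_counts())
print(results["ListValue"].head())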
system/baseline/hyde_fc_generation_optimized.py
ADDED
@@ -0,0 +1,163 @@
from vllm import LLM, SamplingParams
import json
import torch
import time
from datetime import datetime, timedelta
import argparse
from tqdm import tqdm
from typing import List, Dict, Any
import concurrent.futures

class VLLMGenerator:
    def __init__(self, model_name: str, n: int = 8, max_tokens: int = 512,
                 temperature: float = 0.7, top_p: float = 1.0,
                 frequency_penalty: float = 0.0, presence_penalty: float = 0.0,
                 stop: List[str] = ['\n\n\n'], batch_size: int = 32):
        self.device_count = torch.cuda.device_count()
        print(f"Initializing with {self.device_count} GPUs")
        self.llm = LLM(
            model=model_name,
            tensor_parallel_size=self.device_count,
            max_model_len=4096,
            gpu_memory_utilization=0.95,
            enforce_eager=True,
            trust_remote_code=True,
            # quantization="bitsandbytes",
            # dtype="half",
            # load_format="bitsandbytes",
            max_num_batched_tokens=4096,
            max_num_seqs=batch_size
        )
        self.sampling_params = SamplingParams(
            n=n,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            stop=stop,
            logprobs=1
        )
        self.batch_size = batch_size
        self.tokenizer = self.llm.get_tokenizer()
        print(f"Initialization complete. Batch size: {batch_size}")

    def parse_response(self, responses):
        all_outputs = []
        for response in responses:
            to_return = []
            for output in response.outputs:
                text = output.text.strip()
                try:
                    logprob = sum(logprob_obj.logprob for item in output.logprobs for logprob_obj in item.values())
                except:
                    logprob = 0  # Fallback if logprobs aren't available
                to_return.append((text, logprob))
            texts = [r[0] for r in sorted(to_return, key=lambda tup: tup[1], reverse=True)]
            all_outputs.append(texts)
        return all_outputs

    def prepare_prompt(self, claim: str, model_name: str) -> str:
        base_prompt = f"Please write a fact-checking article passage to support, refute, indicate not enough evidence, or present conflicting evidence regarding the claim.\nClaim: {claim}"

        if "OLMo" in model_name:
            return base_prompt
        else:
            messages = [{"role": "user", "content": base_prompt}]
            return self.tokenizer.apply_chat_template(messages, tokenize=False) + "<|start_header_id|>assistant<|end_header_id|>\n\nPassage: "

    def process_batch(self, batch: List[Dict[str, Any]], model_name: str) -> tuple[List[Dict[str, Any]], float]:
        start_time = time.time()
        prompts = [self.prepare_prompt(example["claim"], model_name) for example in batch]

        try:
            results = self.llm.generate(prompts, sampling_params=self.sampling_params)
            outputs = self.parse_response(results)

            for example, output in zip(batch, outputs):
                example['hypo_fc_docs'] = output

            batch_time = time.time() - start_time
            return batch, batch_time
        except Exception as e:
            print(f"Error processing batch: {str(e)}")
            return batch, time.time() - start_time

# def format_time(seconds: float) -> str:
#     return str(timedelta(seconds=int(seconds)))

# def estimate_completion_time(start_time: float, processed_examples: int, total_examples: int) -> str:
#     elapsed_time = time.time() - start_time
#     examples_per_second = processed_examples / elapsed_time
#     remaining_examples = total_examples - processed_examples
#     estimated_remaining_seconds = remaining_examples / examples_per_second
#     completion_time = datetime.now() + timedelta(seconds=int(estimated_remaining_seconds))
#     return completion_time.strftime("%Y-%m-%d %H:%M:%S")

def main(args):
    total_start_time = time.time()
    print(f"Script started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Load data
    print("Loading data...")
    with open(args.target_data, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    print(f"Loaded {len(examples)} examples")

    # Initialize generator
    print("Initializing generator...")
    generator = VLLMGenerator(
        model_name=args.model,
        batch_size=32
    )

    # Process data in batches
    processed_data = []
    # batch_times = []
    batches = [examples[i:i + generator.batch_size] for i in range(0, len(examples), generator.batch_size)]

    print(f"\nProcessing {len(batches)} batches...")
    with tqdm(total=len(examples), desc="Processing examples") as pbar:
        for batch_idx, batch in enumerate(batches, 1):
            processed_batch, batch_time = generator.process_batch(batch, args.model)
            processed_data.extend(processed_batch)
            # batch_times.append(batch_time)

            # Update progress and timing information
            # examples_processed = len(processed_data)
            # avg_batch_time = sum(batch_times) / len(batch_times)
            # estimated_completion = estimate_completion_time(total_start_time, examples_processed, len(examples))

            # pbar.set_postfix({
            #     'Batch': f"{batch_idx}/{len(batches)}",
            #     'Avg Batch Time': f"{avg_batch_time:.2f}s",
            #     'ETA': estimated_completion
            # })
            # pbar.update(len(batch))

    # Calculate and display timing statistics
    # total_time = time.time() - total_start_time
    # avg_batch_time = sum(batch_times) / len(batch_times)
    # avg_example_time = total_time / len(examples)

    # print("\nTiming Statistics:")
    # print(f"Total Runtime: {format_time(total_time)}")
    # print(f"Average Batch Time: {avg_batch_time:.2f} seconds")
    # print(f"Average Time per Example: {avg_example_time:.2f} seconds")
    # print(f"Throughput: {len(examples)/total_time:.2f} examples/second")

    # Save results
    # print("\nSaving results...")
    with open(args.json_output, "w", encoding="utf-8") as output_json:
        json.dump(processed_data, output_json, ensure_ascii=False, indent=4)

    # print(f"Script completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # print(f"Total runtime: {format_time(total_time)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--target_data', default='data_store/averitec/dev.json')
    parser.add_argument('-o', '--json_output', default='data_store/hyde_fc.json')
    parser.add_argument('-m', '--model', default="meta-llama/Llama-3.1-8B-Instruct")
    args = parser.parse_args()
    main(args)
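The script above reads a JSON array of examples, generates hypothetical fact-checking passages for each "claim" with vLLM, and writes the array back with an added "hypo_fc_docs" list per example. A small sketch of preparing an input file and invoking it; the claims are made-up placeholders, the paths are just the script's argparse defaults, and a GPU plus the vllm package are assumed to be available.

import json
import subprocess

# Minimal input: the generator only requires a "claim" field per example (illustrative claims).
examples = [
    {"claim": "The government pledged to recruit 6,500 new teachers."},
    {"claim": "NHS waiting lists will be halved within two years."},
]
with open("data_store/averitec/dev.json", "w", encoding="utf-8") as f:
    json.dump(examples, f, ensure_ascii=False, indent=4)

# Run the script with its default input/output/model arguments; the output file
# contains the same examples with an extra "hypo_fc_docs" list of generated passages.
subprocess.run([
    "python", "system/baseline/hyde_fc_generation_optimized.py",
    "-i", "data_store/averitec/dev.json",
    "-o", "data_store/hyde_fc.json",
    "-m", "meta-llama/Llama-3.1-8B-Instruct",
], check=True)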
system/baseline/question_generation_optimized.py
ADDED
@@ -0,0 +1,244 @@
import os
import argparse
import time
import json
import nltk
from rank_bm25 import BM25Okapi
import numpy as np
import torch
from vllm import LLM, SamplingParams
from datetime import datetime, timedelta
from itertools import islice


def download_nltk_data(package_name, download_dir='nltk_data'):
    # Ensure the download directory exists
    os.makedirs(download_dir, exist_ok=True)

    # Set NLTK data path
    nltk.data.path.append(download_dir)

    try:
        # Try to find the resource
        nltk.data.find(f'tokenizers/{package_name}')
        print(f"Package '{package_name}' is already downloaded")
    except LookupError:
        # If resource isn't found, download it
        print(f"Downloading {package_name}...")
        nltk.download(package_name, download_dir=download_dir)
        print(f"Successfully downloaded {package_name}")

# def format_time(seconds):
#     """Format time duration nicely."""
#     return str(timedelta(seconds=round(seconds)))

def claim2prompts(example):
    claim = example["claim"]
    claim_str = "Example [NUMBER]:||Claim: " + claim + "||Evidence: "

    for question in example["questions"]:
        q_text = question["question"].strip()
        if len(q_text) == 0:
            continue

        if not q_text[-1] == "?":
            q_text += "?"

        answer_strings = []

        for a in question["answers"]:
            if a["answer_type"] in ["Extractive", "Abstractive"]:
                answer_strings.append(a["answer"])
            if a["answer_type"] == "Boolean":
                answer_strings.append(a["answer"] + ", because " + a["boolean_explanation"].lower().strip())

        for a_text in answer_strings:
            if not a_text[-1] in [".", "!", ":", "?"]:
                a_text += "."

            prompt_lookup_str = a_text
            this_q_claim_str = claim_str + a_text.strip() + "||Question: " + q_text
            yield (prompt_lookup_str, this_q_claim_str.replace("\n", " ").replace("||", "\n")[:1500])

def main(args):
    # script_start = time.time()
    # start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # print(f"Script started at: {start_time}")
    # print(f"Loading model: {args.model}")


    download_nltk_data('punkt')
    download_nltk_data('punkt_tab')

    # Load and prepare reference corpus
    # corpus_start = time.time()
    with open(args.reference_corpus, "r", encoding="utf-8") as json_file:
        train_examples = json.load(json_file)

    prompt_corpus, tokenized_corpus = [], []
    for example in train_examples:
        for lookup_str, prompt in claim2prompts(example):
            entry = nltk.word_tokenize(lookup_str)
            tokenized_corpus.append(entry)
            prompt_corpus.append(prompt)

    prompt_bm25 = BM25Okapi(tokenized_corpus)
    # print(f"Reference corpus processed in: {format_time(time.time() - corpus_start)}")

    # Initialize vLLM with optimized settings
    gpu_count = torch.cuda.device_count()
    print(f"Using {gpu_count} GPU{'s' if gpu_count > 1 else ''}")

    # model_start = time.time()
    llm = LLM(
        model=args.model,
        tensor_parallel_size=gpu_count,
        max_model_len=4096,
        gpu_memory_utilization=0.95,
        enforce_eager=True,
        trust_remote_code=True,
        # dtype="half",
    )
    llm.get_tokenizer().pad_token = "<|end_of_text|>"
    # print(f"Model loaded in: {format_time(time.time() - model_start)}")

    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.9,
        top_k=1,
        skip_special_tokens=False,
        max_tokens=512,
        stop=['<|end_of_text|>', '</s>', '<|im_end|>', '[INST]', '[/INST]','<|eot_id|>','<|end|>','<|endoftext|>']
    )

    # processing_start = time.time()

    # Load target data
    target_examples = []
    with open(args.top_k_target_knowledge, "r", encoding="utf-8") as json_file:
        for line in json_file:
            target_examples.append(json.loads(line))

    if args.end == -1:
        args.end = len(target_examples)
    print(f"Processing {args.end} examples")

    # Process in batches
    with torch.no_grad():
        with open(args.output_questions, "w", encoding="utf-8") as output_file:
            for idx in range(0, args.end, args.batch_size):
                batch_end = min(idx + args.batch_size, args.end)
                current_batch = target_examples[idx:batch_end]
                print(f"\nProcessing batch {idx}-{batch_end}...")

                for example in current_batch:
                    # batch_start = time.time()
                    claim = example["claim"]
                    claim_id = example["claim_id"]
                    top_k_sentences_urls = example[f"top_{args.top_k}"]

                    batch_prompts = []
                    batch_metadata = []

                    # Prepare all prompts for current example
                    for sentences_urls in top_k_sentences_urls:
                        prompt_lookup_str = sentences_urls["sentence"]
                        url = sentences_urls["url"]

                        prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
                        prompt_n = 10
                        prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
                        prompt_docs = [prompt_corpus[i] for i in prompt_top_n]

                        temp_prompt = "\n\n".join(prompt_docs)
                        for k in range(1, temp_prompt.count("[NUMBER]")+1):
                            temp_prompt = temp_prompt.replace("[NUMBER]", f"{k}", 1)

                        claim_prompt = "Your task is to generate a question based on the given claim and evidence. The question should clarify the relationship between the evidence and the claim\n\n"
                        evidence = prompt_lookup_str.replace("\n", " ")
                        full_prompt = claim_prompt + temp_prompt + "\n\nNow, generate a question that links the following claim and evidence:" + f"\n\nClaim: {claim}" + f"\nEvidence: {evidence}"

                        if "OLMo" in args.model:
                            inputs = [full_prompt]
                        else:
                            messages = [{"role":"user", "content":full_prompt}]
                            inputs = llm.get_tokenizer().apply_chat_template(messages, tokenize=False)
                            inputs += "<|start_header_id|>assistant<|end_header_id|>\n\nQuestion: "

                        batch_prompts.append(inputs)
                        batch_metadata.append((url, prompt_lookup_str))

                    # Process batch
                    outputs = llm.generate(batch_prompts, sampling_params)

                    # Process outputs
                    evidence = []
                    for output, (url, sent) in zip(outputs, batch_metadata):
                        question = output.outputs[0].text.strip().split("?")[0].replace("\n", " ") + "?"
                        evidence.append({
                            "question": question,
                            "answer": sent,
                            "url": url
                        })

                    # Write results
                    json_data = {
                        "claim_id": claim_id,
                        "claim": claim,
                        "evidence": evidence
                    }
                    output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                    output_file.flush()

                    # batch_time = time.time() - batch_start
                    # print(f"Processed example {claim_id}. Time elapsed: {batch_time:.2f}s")

    # Calculate and display timing information
    # total_time = time.time() - script_start
    # processing_time = time.time() - processing_start
    # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # print("\nTiming Summary:")
    # print(f"Start time: {start_time}")
    # print(f"End time: {end_time}")
    # print(f"Total runtime: {format_time(total_time)}")
    # print(f"Setup time: {format_time(processing_start - script_start)}")
    # print(f"Processing time: {format_time(processing_time)}")
    # print(f"Results written to: {args.output_questions}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Use a prompt to generate questions that could be answered by top-k retrieved evidence. Output generated questions.")
    parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")
    parser.add_argument("--reference_corpus", default="baseline/train.json")
    parser.add_argument(
        "-i",
        "--top_k_target_knowledge",
        default="data_store/dev_reranking_top_k.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    parser.add_argument(
        "-o",
        "--output_questions",
        default="data_store/dev_top_k_qa.json",
        help="Directory where the sentences for the scraped data is saved.",
    )
    parser.add_argument(
        "--top_k",
        default=10,
        type=int
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=4,
        help="Number of examples to process in each batch"
    )
    parser.add_argument(
        "-e",
        "--end",
        type=int,
        default=-1
    )

    args = parser.parse_args()
    main(args)
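claim2prompts expects each reference-corpus entry (from --reference_corpus, e.g. baseline/train.json) to carry a "claim" plus "questions", where every question has "answers" with an "answer_type", and Boolean answers also carry a "boolean_explanation". A sketch of that schema follows; the shape is what the function reads, while the claim, question, and answer text are made up for illustration.

# Illustrative reference-corpus entry in the shape claim2prompts() consumes.
reference_entry = {
    "claim": "The government pledged to recruit 6,500 new teachers.",
    "questions": [
        {
            "question": "How many new teachers has the government recruited so far",
            "answers": [
                {"answer": "Around 2,000 teachers had been recruited by March.",
                 "answer_type": "Extractive"},
                {"answer": "No",
                 "answer_type": "Boolean",
                 "boolean_explanation": "recruitment is still well below the target."},
            ],
        }
    ],
}

# claim2prompts() turns each answer into a (lookup_string, few-shot prompt) pair:
# the lookup string is what BM25 matches against a retrieved sentence, and the
# prompt becomes one in-context example ("Example k: Claim ... Evidence ... Question ...").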
system/baseline/reranking_optimized.py
ADDED
@@ -0,0 +1,230 @@
import os
import torch
import gc
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import argparse
import time
from datetime import datetime, timedelta
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def encode_text(model, tokenizer, texts, batch_size=8, max_length=512):
    """Encode texts to embeddings using AutoModel"""
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        # Tokenize
        encoded_input = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(model.device)

        # Compute token embeddings
        with torch.no_grad():
            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                model_output = model(**encoded_input)
                # Use mean pooling
                attention_mask = encoded_input['attention_mask']
                token_embeddings = model_output[0]  # First element contains token embeddings
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                all_embeddings.append(embeddings.cpu().numpy())

        # Clear some memory
        if i % (batch_size * 4) == 0:
            torch.cuda.empty_cache()
            gc.collect()

    return np.vstack(all_embeddings)

def compute_similarity(emb1, emb2):
    """Compute cosine similarity between embeddings"""
    return np.dot(emb1, emb2.T) / (
        np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
        np.linalg.norm(emb2, axis=1).reshape(1, -1)
    )

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def preprocess_sentences(sentence1, sentence2):
    vectorizer = TfidfVectorizer().fit_transform([sentence1, sentence2])
    vectors = vectorizer.toarray()

    cosine_sim = cosine_similarity(vectors)
    similarity_score = cosine_sim[0][1]
    return similarity_score

def remove_trailing_special_chars(text):
    return re.sub(r'[\W_]+$', '', text)

def remove_special_chars_except_spaces(text):
    return re.sub(r'[^\w\s]+', '', text)

def select_top_k(claim, results, top_k):
    '''
    remove sentence of similarity claim
    '''
    dup_check = set()
    top_k_sentences_urls = []

    i = 0
    print(results)
    claim = remove_special_chars_except_spaces(claim).lower()
    while len(top_k_sentences_urls) < top_k and i < len(results):
        print(i)
        sentence = remove_special_chars_except_spaces(results[i]['sentence']).lower()

        if sentence not in dup_check:
            if preprocess_sentences(claim, sentence) > 0.97:
                dup_check.add(sentence)
                continue

            if claim in sentence:
                if len(claim) / len(sentence) > 0.92:
                    dup_check.add(sentence)
                    continue

            top_k_sentences_urls.append({
                'sentence': results[i]['sentence'],
                'url': results[i]['url']}
            )
        i += 1

    return top_k_sentences_urls

# def format_time(seconds):
#     """Format time duration nicely."""
#     return str(timedelta(seconds=round(seconds)))


def compute_embeddings_batched(model, texts, batch_size=8):
    """Compute embeddings in smaller batches to manage memory"""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):  # Use bfloat16
            emb = model.encode(batch, batch_size=len(batch), show_progress_bar=False)
            all_embeddings.append(emb)

        # Clear some memory
        if i % (batch_size * 4) == 0:
            torch.cuda.empty_cache()
            gc.collect()

    return np.vstack(all_embeddings)

def main(args):


    device = "cuda" if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load model and tokenizer
    model = AutoModel.from_pretrained(
        "Salesforce/SFR-Embedding-2_R",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/SFR-Embedding-2_R")

    # Load target examples
    target_examples = []
    with open(args.target_data, "r", encoding="utf-8") as json_file:
        for i, line in enumerate(json_file):
            try:
                example = json.loads(r"{}".format(line))
                target_examples.append(example)
            except:
                print(f"CURRENT LINE broken {i}")

    if args.end == -1:
        args.end = len(target_examples)

    files_to_process = list(range(args.start, args.end))
    total = len(files_to_process)

    task = 'Given a web search query, retrieve relevant passages that answer the query'

    with open(args.json_output, "w", encoding="utf-8") as output_json:
        done = 0
        for idx, example in enumerate(target_examples):
            if idx in files_to_process:
                print(f"Processing claim {example['claim_id']}... Progress: {done + 1} / {total}")

                claim = example['claim']
                query = [get_detailed_instruct(task, claim)] + [
                    get_detailed_instruct(task, le)
                    for le in example['hypo_fc_docs']
                    if len(le.strip()) > 0
                ]
                query_length = len(query)
                sentences = [sent['sentence'] for sent in example[f'top_{5000}']][:args.retrieved_top_k]

                # st = time.time()
                try:
                    # Process query embeddings
                    query_embeddings = encode_text(model, tokenizer, query, batch_size=4)
                    avg_emb_q = np.mean(query_embeddings, axis=0)
                    hyde_vector = avg_emb_q.reshape((1, -1))

                    # Process sentence embeddings in smaller chunks
                    sentence_embeddings = encode_text(
                        model,
                        tokenizer,
                        sentences,
                        batch_size=args.batch_size
                    )

                    # Compute similarities in chunks to save memory
                    chunk_size = 1000
                    all_scores = []
                    for i in range(0, len(sentence_embeddings), chunk_size):
                        chunk = sentence_embeddings[i:i + chunk_size]
                        chunk_scores = compute_similarity(hyde_vector, chunk)[0]
                        all_scores.extend(chunk_scores)

                    scores = np.array(all_scores)
                    top_k_idx = np.argsort(scores)[::-1]
                    results = [example['top_5000'][i] for i in top_k_idx]
                    top_k_sentences_urls = select_top_k(claim, results, args.top_k)

                    # print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st:.2f}s")

                    json_data = {
                        "claim_id": example['claim_id'],
                        "claim": claim,
                        f"top_{args.top_k}": top_k_sentences_urls
                    }
                    output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                    output_json.flush()

                except RuntimeError as e:
                    print(f"Error processing claim {example['claim_id']}: {e}")
                    continue

                done += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--target_data", default="data_store/dev_retrieval_top_k.json")
    parser.add_argument("--retrieved_top_k", type=int, default=5000)
    parser.add_argument("--top_k", type=int, default=10)
    parser.add_argument("-o", "--json_output", type=str, default="data_store/dev_reranking_top_k.json")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("-s", "--start", type=int, default=0)
    parser.add_argument("-e", "--end", type=int, default=-1)
    args = parser.parse_args()

    main(args)
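The reranker averages the query-side embeddings (the claim plus its hypothetical fact-check passages) into a single HyDE vector and scores every candidate sentence by cosine similarity against it. A toy NumPy sketch of that scoring step, mirroring compute_similarity above; the 4-dimensional random embeddings are placeholders for real model outputs.

import numpy as np

def cosine_sim(emb1, emb2):
    # Same normalized dot product as compute_similarity() above.
    return np.dot(emb1, emb2.T) / (
        np.linalg.norm(emb1, axis=1).reshape(-1, 1) *
        np.linalg.norm(emb2, axis=1).reshape(1, -1)
    )

# Toy embeddings: 3 query-side vectors (claim + 2 hypothetical passages), 4 candidate sentences.
query_embeddings = np.random.rand(3, 4)
sentence_embeddings = np.random.rand(4, 4)

# Average the query-side embeddings into a single HyDE vector, as in main().
hyde_vector = query_embeddings.mean(axis=0).reshape(1, -1)

# Score candidates and rank them from most to least similar.
scores = cosine_sim(hyde_vector, sentence_embeddings)[0]
ranking = np.argsort(scores)[::-1]
print(ranking, scores[ranking])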
system/baseline/retrieval_optimized.py
ADDED
@@ -0,0 +1,244 @@
import argparse
import json
import os
import time
import numpy as np
import pandas as pd
import nltk
from rank_bm25 import BM25Okapi
from multiprocessing import Pool, cpu_count, Manager, Lock
from functools import partial
import heapq
from threading import Thread, Event
import queue
from datetime import datetime, timedelta


def download_nltk_data(package_name, download_dir='nltk_data'):
    # Ensure the download directory exists
    os.makedirs(download_dir, exist_ok=True)

    # Set NLTK data path
    nltk.data.path.append(download_dir)

    try:
        # Try to find the resource
        nltk.data.find(f'tokenizers/{package_name}')
        print(f"Package '{package_name}' is already downloaded")
    except LookupError:
        # If resource isn't found, download it
        print(f"Downloading {package_name}...")
        nltk.download(package_name, download_dir=download_dir)
        print(f"Successfully downloaded {package_name}")


def combine_all_sentences(knowledge_file):
    sentences, urls = [], []

    with open(knowledge_file, "r", encoding="utf-8") as json_file:
        for i, line in enumerate(json_file):
            data = json.loads(line)
            sentences.extend(data["url2text"])
            urls.extend([data["url"] for _ in range(len(data["url2text"]))])
    return sentences, urls, i + 1

def remove_duplicates(sentences, urls):
    df = pd.DataFrame({"document_in_sentences":sentences, "sentence_urls":urls})
    df['sentences'] = df['document_in_sentences'].str.strip().str.lower()
    df = df.drop_duplicates(subset="sentences").reset_index()
    return df['document_in_sentences'].tolist(), df['sentence_urls'].tolist()

def retrieve_top_k_sentences(query, document, urls, top_k):
    tokenized_docs = [nltk.word_tokenize(doc) for doc in document[:top_k]]
    bm25 = BM25Okapi(tokenized_docs)

    scores = bm25.get_scores(nltk.word_tokenize(query))
    top_k_idx = np.argsort(scores)[::-1][:top_k]

    return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]

def process_single_example(idx, example, args, result_queue, counter, lock):
    try:
        with lock:
            current_count = counter.value + 1
            counter.value = current_count
            print(f"\nProcessing claim {idx}... Progress: {current_count} / {args.total_examples}")

        # start_time = time.time()

        document_in_sentences, sentence_urls, num_urls_this_claim = combine_all_sentences(
            os.path.join(args.knowledge_store_dir, f"{idx}.jsonl")
        )

        print(f"Obtained {len(document_in_sentences)} sentences from {num_urls_this_claim} urls.")

        document_in_sentences, sentence_urls = remove_duplicates(document_in_sentences, sentence_urls)

        query = example["claim"] + " " + " ".join(example['hypo_fc_docs'])
        top_k_sentences, top_k_urls = retrieve_top_k_sentences(
            query, document_in_sentences, sentence_urls, args.top_k
        )


        result = {
            "claim_id": idx,
            "claim": example["claim"],
            f"top_{args.top_k}": [
                {"sentence": sent, "url": url}
                for sent, url in zip(top_k_sentences, top_k_urls)
            ],
            "hypo_fc_docs": example['hypo_fc_docs']
        }

        result_queue.put((idx, result))
        return True
    except Exception as e:
        print(f"Error processing example {idx}: {str(e)}")
        result_queue.put((idx, None))
        return False

def writer_thread(output_file, result_queue, total_examples, stop_event):
    next_index = 0
    pending_results = []

    with open(output_file, "w", encoding="utf-8") as f:
        while not (stop_event.is_set() and result_queue.empty()):
            try:
                idx, result = result_queue.get(timeout=1)

                if result is not None:
                    heapq.heappush(pending_results, (idx, result))

                    while pending_results and pending_results[0][0] == next_index:
                        _, result_to_write = heapq.heappop(pending_results)
                        f.write(json.dumps(result_to_write, ensure_ascii=False) + "\n")
                        f.flush()
                        next_index += 1

            except queue.Empty:
                continue

# def format_time(seconds):
#     """Format time duration nicely."""
#     return str(timedelta(seconds=round(seconds)))

def main(args):



    download_nltk_data('punkt')
    download_nltk_data('punkt_tab')

    with open(args.target_data, "r", encoding="utf-8") as json_file:
        target_examples = json.load(json_file)

    if args.end == -1:
        args.end = len(target_examples)

    print(f"Total examples to process: {args.end - args.start}")

    files_to_process = list(range(args.start, args.end))
    examples_to_process = [(idx, target_examples[idx]) for idx in files_to_process]

    num_workers = min(args.workers if args.workers > 0 else cpu_count(), len(files_to_process))
    print(f"Using {num_workers} workers to process {len(files_to_process)} examples")

    with Manager() as manager:
        counter = manager.Value('i', 0)
        lock = manager.Lock()
        args.total_examples = len(files_to_process)

        result_queue = manager.Queue()

        stop_event = Event()
        writer = Thread(
            target=writer_thread,
            args=(args.json_output, result_queue, len(files_to_process), stop_event)
        )
        writer.start()

        process_func = partial(
            process_single_example,
            args=args,
            result_queue=result_queue,
            counter=counter,
            lock=lock
        )

        with Pool(num_workers) as pool:
            results = pool.starmap(process_func, examples_to_process)

        stop_event.set()
        writer.join()

        # successful = sum(1 for r in results if r)
        # print(f"\nSuccessfully processed {successful} out of {len(files_to_process)} examples")
        # print(f"Results written to {args.json_output}")

        # # Calculate and display timing information
        # total_time = time.time() - script_start
        # avg_time = total_time / len(files_to_process)
        # end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # print("\nTiming Summary:")
        # print(f"Start time: {start_time}")
        # print(f"End time: {end_time}")
        # print(f"Total runtime: {format_time(total_time)} (HH:MM:SS)")
        # print(f"Average time per example: {avg_time:.2f} seconds")
        # if successful > 0:
        #     print(f"Processing speed: {successful / total_time:.2f} examples per second")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Get top 10000 sentences with BM25 in the knowledge store using parallel processing."
    )
    parser.add_argument(
        "-k",
        "--knowledge_store_dir",
        type=str,
        default="data_store/knowledge_store",
        help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
+
)
|
202 |
+
parser.add_argument(
|
203 |
+
"--target_data",
|
204 |
+
type=str,
|
205 |
+
default="data_store/hyde_fc.json",
|
206 |
+
help="The path of the file that stores the claim.",
|
207 |
+
)
|
208 |
+
parser.add_argument(
|
209 |
+
"-o",
|
210 |
+
"--json_output",
|
211 |
+
type=str,
|
212 |
+
default="data_store/dev_retrieval_top_k.json",
|
213 |
+
help="The output dir for JSON files to save the top 100 sentences for each claim.",
|
214 |
+
)
|
215 |
+
parser.add_argument(
|
216 |
+
"--top_k",
|
217 |
+
default=5000,
|
218 |
+
type=int,
|
219 |
+
help="How many documents should we pick out with BM25.",
|
220 |
+
)
|
221 |
+
parser.add_argument(
|
222 |
+
"-s",
|
223 |
+
"--start",
|
224 |
+
type=int,
|
225 |
+
default=0,
|
226 |
+
help="Starting index of the files to process.",
|
227 |
+
)
|
228 |
+
parser.add_argument(
|
229 |
+
"-e",
|
230 |
+
"--end",
|
231 |
+
type=int,
|
232 |
+
default=-1,
|
233 |
+
help="End index of the files to process.",
|
234 |
+
)
|
235 |
+
parser.add_argument(
|
236 |
+
"-w",
|
237 |
+
"--workers",
|
238 |
+
type=int,
|
239 |
+
default=0,
|
240 |
+
help="Number of worker processes (default: number of CPU cores)",
|
241 |
+
)
|
242 |
+
|
243 |
+
args = parser.parse_args()
|
244 |
+
main(args)
|
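For readers unfamiliar with rank_bm25, here is a minimal, self-contained sketch of the ranking step that retrieve_top_k_sentences performs above; the documents and query are toy placeholders rather than project data, and it assumes the NLTK "punkt" tokenizer is already downloaded.

import nltk
import numpy as np
from rank_bm25 import BM25Okapi

# Toy corpus and query (placeholders, not project data)
documents = [
    "The government pledged free breakfast clubs in every primary school.",
    "An unrelated sentence about the weather forecast.",
    "Funding for school breakfast clubs was announced this week.",
]
query = "free breakfast clubs in primary schools"

# Tokenise, build the BM25 index, score the query, and keep the 2 best sentences
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]
bm25 = BM25Okapi(tokenized_docs)
scores = bm25.get_scores(nltk.word_tokenize(query.lower()))
top_idx = np.argsort(scores)[::-1][:2]
print([documents[i] for i in top_idx])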
system/baseline/train.json
ADDED
The diff for this file is too large to render.
See raw diff
system/ee.py
ADDED
@@ -0,0 +1,98 @@
import json
import os
import argparse
from tqdm import tqdm
import tiktoken
from openai import OpenAI

def gpt_4o(input_text):
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].message.content

def run_gpt4_event_extraction(data_dir, icl_path, max_tokens=100000):

    all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
    output_dir = os.path.join(data_dir, "gpt4_event_extraction")
    os.makedirs(output_dir, exist_ok=True)

    ICL = open(icl_path, "r").read()
    all_info = open(all_info_path, "r").readlines()

    enc = tiktoken.encoding_for_model("gpt-4o")

    for i, line in enumerate(all_info):
        ID = i
        urls = []
        results = []

        data = json.loads(line)
        docs = data["evidence"]
        claim = data["claim"]

        output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
        if os.path.exists(output_path):
            print(f"Output already exists: {output_path}")

        else:
            for doc in tqdm(docs):
                if doc["url"] in urls:
                    continue

                text = " ".join(doc["text"])
                input_text = (
                    f"{ICL}\nInput:\n\nTitle: {doc['metadata']['title']}\n"
                    f"Date: {doc['metadata']['date']}\nArticle: {text}\n\n"
                    f"Please only summarize events that are useful for verifying the claim '{claim}', and their dates in the JSON format.\n\nOutput:\n"
                )

                urls.append(doc["url"])
                text_tokens = enc.encode(input_text)
                if len(text_tokens) > max_tokens:
                    input_text = enc.decode(text_tokens[:max_tokens])

                try:
                    output = gpt_4o(input_text)
                    print(f"GPT-4o Response: {output}")
                    results.append({
                        "url": doc["url"],
                        "title": doc["metadata"]["title"],
                        "date": doc["metadata"]["date"],
                        "article": text,
                        "output": json.loads(output)
                    })
                except Exception as e:
                    print(f"Error processing doc: {e}")
                    continue

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=4)

    return output_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
    parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
    parser.add_argument("--icl_path", type=str, required=True, help="Path to ICL prompt file")
    parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")

    args = parser.parse_args()

    # Pass the parsed --data_dir to the matching data_dir parameter
    run_gpt4_event_extraction(
        data_dir=args.data_dir,
        icl_path=args.icl_path,
        max_tokens=args.max_tokens
    )
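As a rough usage note, run_gpt4_event_extraction can also be called directly from Python, mirroring the call made later in system/pledge_tracking.py; the directory below is a placeholder that must already contain all_info_with_txt.json, and the OAI environment variable must hold an OpenAI API key.

# Hedged sketch: "outputs/demo" is a placeholder output directory, not part of this commit.
from system.ee import run_gpt4_event_extraction

results_path = run_gpt4_event_extraction(
    data_dir="outputs/demo",
    icl_path="system/icl.txt",
    max_tokens=100000,
)
print(results_path)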
system/existing_pledges.txt
ADDED
@@ -0,0 +1,54 @@
Take back our streets by halving serious violent crime
We will support families with children by introducing free breakfast clubs in every primary school
We will finally deliver a full trans-inclusive ban on conversion practices
We will introduce a ‘Hillsborough Law’ which will place a legal duty of candour on public servants and authorities, and provide legal aid for victims of disasters or state-related deaths
As a first step, in England, we will deliver an extra two million NHS operations, scans, and appointments every year; that is 40,000 more appointments every week
We will end the use of offshore trusts to avoid inheritance tax so that everyone who makes their home here in the UK pays their taxes here
We will abolish non-dom status once and for all, replacing it with a modern scheme for people genuinely in the country for a short period
We will end the VAT exemption and business rates relief for private schools
We will get Britain building again … with 1.5 million new homes over the next parliament
We will ensure the next generation can never legally buy cigarettes
We will not increase taxes on working people
We will not increase taxes on working people, which is why we will not increase National Insurance
We will not increase taxes on working people, which is why we will not increase [...] the basic, higher, or additional rates of Income Tax
We will not increase taxes on working people, which is why we will not increase [...] VAT
We will intervene earlier to stop young people being drawn into crime, creating a new Young Futures programme with a network of hubs reaching every community
Raising confidence in the … criminal justice system to its highest levels
We will recruit an additional 8,500 new staff to treat children and adults through our first term
Kickstart economic growth to secure the highest sustained growth in the G7
Raising confidence in the police … to its highest levels
We will [introduce] new Respect Orders - powers to ban persistent adult offenders from town centres, which will stamp out issues such as public drinking and drug use
We will create a new Border Security Command, with hundreds of new investigators, intelligence officers, and cross-border police officers
Capitalised with £7.3 billion over the course of the next Parliament, the National Wealth Fund will have a remit to support We’s growth and clean energy missions
We will introduce a new participation requirement [for House of Lords members]
The next We government will therefore bring about an immediate modernisation, by introducing legislation to remove the right of hereditary peers to sit and vote in the House of Lords
A new Energy Independence Act will establish the framework for We’s energy and climate policies
We will introduce a Football Governance Bill, which will establish an independent regulator to ensure financial sustainability of football clubs in England
Every fiscal event making significant changes to taxation or spending will be subject to an independent OBR forecast
We will establish a National Wealth Fund
We will conduct a Strategic Defence Review within our first year in government
Ending the wasteful Migration and Economic Development partnership with Rwanda
We will cap corporation tax at the current level of 25%, the lowest in the G7, for the entire parliament
We will introduce a new ‘Fit For the Future’ fund to double the number of CT and MRI scanners, allowing the NHS to catch cancer and other conditions earlier, saving lives
We will … [give] 16- and 17-year-olds the right to vote in all elections
We will set up a new returns and enforcement unit, with an additional 1,000 staff, to fast-track removals to safe countries for people who do not have the right to stay here
We will capitalise Great British Energy with £8.3 billion, over the next parliament
We will immediately update the National Policy Planning Framework [sic] to undo damaging Conservative changes, including restoring mandatory housing targets
Recruit 6,500 new expert teachers in key subjects
We will carry out a review of sentencing to ensure it is brought up to date
We will train thousands more GPs
We will return to meeting NHS performance standards. That means patients should expect to wait no longer than 18 weeks from referral for consultant-led treatment of non-urgent health conditions
Productivity growth in every part of the country
The government will deliver a milestone of higher living standards in every part of the United Kingdom by the end of the Parliament
Giving children the best start in life, with a record 75% of 5-year-olds in England ready to learn when they start school
We will fix an additional one million potholes across England in each year of the next parliament
We will not grant new coal licences
We will set out the path to spending 2.5 per cent of GDP on defence
We will [...] address the inconsistencies in voter ID rules that prevent legitimate voters from voting. For example, in the case of HM Armed Forces Veteran Cards
We will also introduce a mandatory retirement age. At the end of the Parliament in which a member reaches 80 years of age, they will be required to retire from the House of Lords
We will create a new publicly-owned company, Great British Energy
We will negotiate additional returns arrangements to speed up returns
We will not issue new licences to explore new [oil and gas] fields
We will deliver our commitment to spend 2.5% of GDP on defence, but we will bring it forward so that we reach that level in 2027 and we will maintain that for the rest of this Parliament
We will tackle the immediate crisis with a rescue plan to provide 700,000 more urgent dental appointments
We will … end asylum hotels, saving the taxpayer billions of pounds
system/generate_output.py
ADDED
@@ -0,0 +1,75 @@
import json
import os
import argparse
from system.html2lines import html2metadata
from lxml.etree import tostring
import lxml.etree

def process_manifesto_data_with_metadata(input_base_dir: str):

    input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")

    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_file = f.readlines()

    out_file = open(output_file_path, "w", encoding="utf-8")

    i = 0

    for id, line in enumerate(input_file):
        line = json.loads(line)
        claim = line["claim"]
        QAs = line["top_50"]
        new_line = {"claim": claim, "evidence": []}

        json_path = os.path.join(url2text_dir, f"{id}.jsonl")
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found")
            continue

        with open(json_path, "r", encoding="utf-8") as f:
            try:
                data_store = json.load(f)
            except json.JSONDecodeError:
                f.seek(0)
                data_store = [json.loads(line) for line in f]

        url_txt = {data["url"]: data["url2text"] for data in data_store}

        URLs = []
        for j, QA in enumerate(QAs):
            newQA = QA.copy()
            URL = QA["url"]
            newQA["text"] = url_txt.get(URL, "")

            if URL not in URLs:
                try:
                    meta = html2metadata(URL)
                    if isinstance(meta, lxml.etree._Element):
                        meta = tostring(meta, encoding="unicode", pretty_print=True)
                    meta_save = {
                        "title": meta["title"],
                        "date": meta["date"]
                    }
                except Exception as e:
                    print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                    meta_save = {
                        "title": "",
                        "date": ""
                    }

            newQA["metadata"] = meta_save
            new_line["evidence"].append(newQA)

        out_file.write(json.dumps(new_line) + "\n")

    out_file.close()
    return output_file_path
system/hero_QA.py
ADDED
@@ -0,0 +1,60 @@
import os
from datetime import datetime
import subprocess


def run_hero_reranking(user_id, end_date):
    base_dir = f"outputs/{user_id}_{end_date}"
    hero_dir = os.path.join(base_dir, "hero")
    os.makedirs(hero_dir, exist_ok=True)

    hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")

    def safe_run(cmd, timeout=600):
        try:
            print(f"👉 Running: {' '.join(cmd)}")
            subprocess.run(cmd, check=True, timeout=timeout)
        except subprocess.CalledProcessError as e:
            print(f"[❌ ERROR] Subprocess failed: {e}")
            if e.stderr:
                print("[stderr]:", e.stderr.decode())
            raise
        except subprocess.TimeoutExpired:
            print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
            raise

    # Step 3.2: retrieval
    print("🔍 Step 3.2: Retrieval from knowledge store ...")
    knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
    retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")

    if not os.path.exists(retrieval_output):
        safe_run([
            "python3.12", "baseline/retrieval_optimized.py",
            "--knowledge_store_dir", knowledge_store_dir,
            "--target_data", hyde_output,
            "--json_output", retrieval_output
        ])

    # Step 3.3: reranking
    print("🏷️ Step 3.3: Reranking retrieved evidence ...")
    rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")

    if not os.path.exists(rerank_output):
        safe_run([
            "python3.12", "baseline/reranking_optimized.py",
            "--target_data", retrieval_output,
            "--json_output", rerank_output
        ])

    return {
        "hyde": hyde_output,
        "retrieved": retrieval_output,
        "reranked": rerank_output,
    }


if __name__ == "__main__":
    # This module defines run_hero_reranking, so call it directly here.
    output_files = run_hero_reranking(user_id="xxx", end_date="20250604")
    for key, path in output_files.items():
        print(f"✅ {key}: {path}")
system/hero_pipeline.py
ADDED
@@ -0,0 +1,157 @@
import os
from datetime import datetime
import subprocess
from huggingface_hub import hf_hub_download
import json

def run_hero_reranking(pipeline_base_dir, suggestion_meta):
    base_dir = f"{pipeline_base_dir}"
    hero_dir = os.path.join(base_dir, "hero")
    os.makedirs(hero_dir, exist_ok=True)

    if suggestion_meta:
        hyde_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_icl_hyde_fc.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"]
        )
        with open(hyde_path, "r", encoding="utf-8") as f:
            all_hyde_data = json.load(f)

        idx = suggestion_meta["index"]
        single_hyde = [all_hyde_data[idx]]
        save_path = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(single_hyde, f, indent=2)

    hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")

    def safe_run(cmd, timeout=600):
        try:
            print(f"👉 Running: {' '.join(str(x) for x in cmd)}")
            subprocess.run(cmd, check=True, timeout=timeout)
        except subprocess.CalledProcessError as e:
            print(f"[❌ ERROR] Subprocess failed: {e}")
            if e.stderr:
                print("[stderr]:", e.stderr.decode())
            raise
        except subprocess.TimeoutExpired:
            print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
            raise

    # Step 3.2: retrieval
    print("🔍 Step 3.2: Retrieval from knowledge store ...")
    knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
    retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")

    if not os.path.exists(retrieval_output):
        safe_run([
            "python", "system/baseline/retrieval_optimized.py",
            "--knowledge_store_dir", knowledge_store_dir,
            "--target_data", hyde_output,
            "--json_output", retrieval_output,
        ])

    # Step 3.3: reranking
    print("🏷️ Step 3.3: Reranking retrieved evidence ...")
    rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")

    if not os.path.exists(rerank_output):
        safe_run([
            "python", "system/baseline/reranking_optimized.py",
            "--target_data", retrieval_output,
            "--json_output", rerank_output,
            "--top_k", str(50),
        ])

    return {
        "hyde": hyde_output,
        "retrieved": retrieval_output,
        "reranked": rerank_output,
    }


def run_hero_pipeline(pipeline_base_dir):
    base_dir = f"{pipeline_base_dir}"
    hero_dir = os.path.join(base_dir, "hero")
    os.makedirs(hero_dir, exist_ok=True)

    target_data = os.path.join(base_dir, "claim.json")
    hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")

    def safe_run(cmd, timeout=600):
        try:
            print(f"👉 Running: {' '.join(cmd)}")
            subprocess.run(cmd, check=True, timeout=timeout)
        except subprocess.CalledProcessError as e:
            print(f"[❌ ERROR] Subprocess failed: {e}")
            if e.stderr:
                print("[stderr]:", e.stderr.decode())
            raise
        except subprocess.TimeoutExpired:
            print(f"[❌ TIMEOUT] Command timed out: {' '.join(cmd)}")
            raise

    # Step 3.1: hyde_fc_generation
    if not os.path.exists(hyde_output):
        print("🧠 Step 3.1: HyDE ICL generation ...")
        safe_run([
            "python", "system/baseline/hyde_fc_generation_optimized.py",
            "--target_data", target_data,
            "--json_output", hyde_output
        ])

    # Step 3.2: retrieval
    print("🔍 Step 3.2: Retrieval from knowledge store ...")
    knowledge_store_dir = os.path.join(base_dir, "initial_data_store")
    retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k.json")

    if not os.path.exists(retrieval_output):
        safe_run([
            "python", "system/baseline/retrieval_optimized.py",
            "--knowledge_store_dir", knowledge_store_dir,
            "--target_data", hyde_output,
            "--json_output", retrieval_output
        ])

    # Step 3.3: reranking
    print("🏷️ Step 3.3: Reranking retrieved evidence ...")
    rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k.json")

    if not os.path.exists(rerank_output):
        safe_run([
            "python", "system/baseline/reranking_optimized.py",
            "--target_data", retrieval_output,
            "--json_output", rerank_output
        ])

    # Step 3.4: question generation
    print("❓ Step 3.4: Generating QA pairs ...")
    reference_corpus = "system/baseline/train.json"
    qa_output = os.path.join(hero_dir, "manifesto_icl_top_k_qa.json")

    if not os.path.exists(qa_output):
        safe_run([
            "python", "system/baseline/question_generation_optimized.py",
            "--reference_corpus", reference_corpus,
            "--top_k_target_knowledge", rerank_output,
            "--output_questions", qa_output,
            "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct"
        ])

    return {
        "hyde": hyde_output,
        "retrieved": retrieval_output,
        "reranked": rerank_output,
        "qa_pairs": qa_output
    }


if __name__ == "__main__":
    user_id = "xxx"
    end_date = "20250604"
    pipeline_base_dir = f"{user_id}_{end_date}"
    # This module defines run_hero_pipeline, so call it directly here.
    output_files = run_hero_pipeline(pipeline_base_dir)
    for key, path in output_files.items():
        print(f"✅ {key}: {path}")
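A hedged sketch of how these two entry points are meant to be combined, based on the order used in system/pledge_tracking.py; the base directory is a placeholder and must already contain claim.json plus the initial and augmented data stores produced by the scraper.

from system.hero_pipeline import run_hero_pipeline, run_hero_reranking

base_dir = "outputs/demo"  # placeholder; normally created by the surrounding pipeline

# First pass: HyDE generation, retrieval, reranking and question generation
first_pass = run_hero_pipeline(base_dir)
print(first_pass["qa_pairs"])

# Second pass over the augmented data store, reusing the HyDE output
second_pass = run_hero_reranking(base_dir, suggestion_meta=None)
print(second_pass["reranked"])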
system/html2lines.py
ADDED
@@ -0,0 +1,105 @@
import sys
from time import sleep
import subprocess

import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
import spacy
from lxml.etree import tostring
import lxml.etree

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("🔁 Downloading spaCy model 'en_core_web_lg' ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
    nlp = spacy.load("en_core_web_lg")


DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
MIN_CHAR = 50
MAX_CHAR = 5000


def get_page(url):
    page = None
    for _ in range(3):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            assert page is not None
            print("Fetched " + url, file=sys.stderr)
            break
        except Exception:
            sleep(3)
    return page


def url2lines(url):
    page = get_page(url)

    if page is None:
        return []

    lines = html2lines(page)
    return lines


def line_correction(lines, max_size=100):
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) > max_size:
            doc = nlp(
                line[:MAX_CHAR]
            )  # We split lines into sentences, but for performance we take only the first 5k characters per line
            stack = ""
            for sent in doc.sents:
                if len(stack) > 0:
                    stack += " "
                stack += str(sent).strip()
                if len(stack) > max_size:
                    out_lines.append(stack)
                    stack = ""

            if (
                len(stack) > MIN_CHAR
            ):  # Ensure every line in out_lines satisfies the MIN_CHAR restriction
                out_lines.append(stack)
        else:
            out_lines.append(line)

    return out_lines


def html2lines(page):
    out_lines = []

    # Check for None before calling .strip() to avoid an AttributeError
    if page is None or len(page.strip()) == 0:
        return out_lines

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return out_lines

    return text.split(
        "\n"
    )  # We just spit out the entire page, so we need to reformat later.


def html2metadata(url):
    page = get_page(url)
    metadata = trafilatura.extract_metadata(page)
    return metadata.as_dict()


if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    text = " ".join(url2lines(url))  # fetch via url2lines; `page` was undefined here before
    print(metadata)
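A brief usage sketch of the helpers above, fetching one page and converting it into cleaned lines; the BBC URL is simply the example already used in the __main__ block.

from system.html2lines import url2lines, line_correction, html2metadata

url = "https://www.bbc.co.uk/news/61407508"
lines = line_correction(url2lines(url))   # fetch, extract text, merge short fragments
meta = html2metadata(url)                 # title, date and other trafilatura metadata
print(meta.get("title"), meta.get("date"), f"{len(lines)} lines kept")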
system/icl.txt
ADDED
@@ -0,0 +1,69 @@
Input:

Title: New investment for Border Security Command
Date: 2024-09-17
Article: Up to £75 million in new investment for the Border Security Command paves way for an autumn immigration crime crackdown. The UK’s Border Security Command will deliver cutting edge new technology, extra officers and further covert capabilities across the system following a significant, immediate cash injection, Home Secretary Yvette Cooper announced today. As part of the new Border Security Command uplift, the National Crime Agency (NCA), the police and other law enforcement agency partners will receive a significant cash injection to bolster the UK’s border security and disrupt the criminal people smuggling gangs. The investment comes ahead of an expected effort by the smuggling gangs to cram ever more vulnerable people into unseaworthy boats launched from the French coast while the weather remains fair. Their industrial scale smuggling business is under sustained pressure from co-ordinated UK and European partner law enforcement action. The Home Secretary announced the package of up to £75 million, which redirects funds originally allocated to the previous government’s Illegal Migration Act. It will unlock sophisticated new technology and extra capabilities for the NCA to bolster UK border security and disrupt the criminal people smuggling gangs. The investment is designed to build on a pattern of successful upstream disruptions announced at an operational summit, attended by the Prime Minister, at the NCA headquarters last week. - covert cameras and state of the art monitoring technology, enhancing evidence collection, speeding up investigations and increasing the likelihood of successful prosecutions - establishing a new unit to improve intelligence collection across UK police forces and information flows to partners, alongside an uplift in prosecutors working in the Crown Prosecution Service to act on investigations to swiftly bring those responsible to justice - recruitment of additional personnel for the new Border Security Command, led by Commander Martin Hewitt, which will oversee the co-operation of all of the organisations involved in smashing the gangs - increased work to tackle organised crime groups facilitating irregular migration upstream by intensifying efforts in transit countries to prevent small boat equipment reaching the French coast The announcement follows yesterday’s meeting between the Prime Minister and his Italian counterpart, Giorgia Meloni, in Rome to discuss systematic bilateral co-operation on border security. Italy has seen a significant drop in irregular migration thanks to tougher enforcement and enhanced cooperation with international partners. Newly appointed Border Security Commander – a director general senior civil servant appointment – Martin Hewitt joined the UK delegation to Rome. The enhanced technical and staffing resources announced today will be an important platform for the work he will co-ordinate across UK law enforcement and intelligence agencies when he formally starts his role in the coming weeks. The funding also covers an additional 100 specialist investigators for the NCA, which was announced by the government last month, representing a 25% increase in the agency’s dedicated personnel tackling organised immigration crime. The government has also announced a 50% increase in the number of British officers stationed at Europol, supporting European operations to dismantle organised crime groups facilitating people smuggling. Criminal gangs are getting away with undermining our border security and putting lives at risk. The Border Security Command will deliver a major overhaul and upgrade in law enforcement against smugglers and trafficking gangs to boost our border security. State of the art technology and enhanced intelligence capabilities will ensure we are using every tool at our disposal to dismantle this vile trade. I welcome this funding, which will allow us to improve and extend our technology, data exploitation, and capacity-building both internationally and in the UK. Tackling organised immigration crime remains a top priority for the NCA, we are currently leading around 70 investigations into the gangs or individuals involved in the highest echelons of criminality, and we are devoting more resources to it than ever before. We are determined to do all we can to disrupt and dismantle these networks, wherever they operate. CPS Director of Public Prosecutions Stephen Parkinson said: CPS prosecutors will bring significant expertise to the new unit to help stop human trafficking gangs in their tracks, and pursue any assets gained through criminality. Working with partners, we will continue to discourage, disrupt and dismantle this exploitative trade through prosecutions and cross-border collaboration. The announcement coincides with a concerted push by UK ministers to tackle shared border security challenges. Immigration Minister Dame Angela Eagle is attending the annual Berlin Process Interior Ministers’ meeting in Germany today (Tuesday 17 September), to discuss strengthening border security, tackling organised crime groups and combatting violence against women and girls across the Western Balkans region. The meeting brings together European partners with a focus on working with partners across the Western Balkans, a key region in the journey of irregular migrants through Europe and, in many cases, onwards to the UK.

Summary events and their dates in the JSON format.

Output:

{
  "events": [
    {
      "event": "Announcement of up to £75 million in new investment for the UK's Border Security Command by Home Secretary Yvette Cooper.",
      "date": "2024-09-17"
    },
    {
      "event": "Immigration Minister Dame Angela Eagle attending the annual Berlin Process Interior Ministers’ meeting in Germany to discuss strengthening border security.",
      "date": "2024-09-17"
    },
    {
      "event": "An operational summit attended by the Prime Minister at the NCA headquarters where successful upstream disruptions were announced.",
      "date": "Last week (relative to 2024-09-17)"
    },
    {
      "event": "A meeting took place in Rome between the UK Prime Minister and Italian Prime Minister Giorgia Meloni to discuss systematic bilateral cooperation on border security.",
      "date": "Yesterday (relative to 2024-09-17)"
    },
    {
      "event": "The government announced an additional 100 specialist investigators for the NCA, representing a 25% increase in dedicated personnel tackling organised immigration crime.",
      "date": "Last month (relative to 2024-09-17)"
    }
  ]
}

Input:

Title: Home Secretary announces new measures to boost Britain’s border security
Date: 2024-08-21
Article: Home Secretary announces new measures to boost Britain’s border security Home Secretary announces new measures to strengthen border security, enforce immigration rules and increase returns. New measures to boost Britain’s border security are being set out today (21 August) by the Home Secretary, including the immediate recruitment of up to 100 new specialist intelligence and investigation officers at the National Crime Agency (NCA) to target, dismantle and disrupt organised immigration crime networks. Yvette Cooper has also today announced a major surge in immigration enforcement and returns activity, to make sure that immigration and asylum rules are respected and enforced - saying that the government has new plans for the next 6 months to achieve the highest rate of removals of those with no right to be here, including failed asylum seekers, for 5 years (since 2018). In addition, a new intelligence-driven illegal working programme will be rolled out to target, investigate and take down unscrupulous employers who illegally employ those with no right to work here. The new measures are fulfilling on the government’s commitment to provide long-term security to our borders. - up to 100 new specialist intelligence and investigations officers deployed to the National Crime Agency (NCA) to disrupt and smash criminal smuggling gangs and prevent dangerous boat crossings - a large surge in enforcement and returns flights, with the aim of putting removals at their highest level since 2018, reversing the damaging drop in enforcement over recent years - increased detention capacity including 290 added beds at Campsfield and Haslar Immigration Removal Centres - redeployment of staff to drive this increase in returns - sanctions to be taken against unscrupulous employers who hire workers illegally This comes on top of the 50% uplift in the number of NCA officers stationed in Europol. These officers have been immediately deployed to support European operations to disrupt the activity of criminal smuggling gangs making millions out of small boat crossings. The NCA currently has around 70 investigations targeting the highest harm criminal networks involved in people smuggling and trafficking, and worked with international partners to support the seizure of around 400 boats and engines intended for use in channel crossings. A range of sanctions, including financial penalty notices, business closure orders and potential prosecution, will be taken against those employing illegal workers. Those caught working illegally and eligible for removal will be detained, pending their swift removal. Alongside this, the government is increasing detention spaces to support the higher pace of removals including reopening and adding 290 beds across Immigration Removal Centres (IRCs) at Campsfield and Haslar. This increase will ensure there is additional capacity to facilitate higher levels of enforcement and returns so that rules are properly respected. Building on 9 successful returns flights in the last six weeks, including the largest-ever chartered return flight, the government is redeploying personnel and resources to support further activity. Staff are being redeployed to increase removal of failed asylum seekers, which had dropped by 40% since 2010. Three hundred caseworkers have already been reassigned to progress thousands of failed asylum and returns cases, including enforced and voluntary returns. Enhanced digital capabilities will be deployed to ensure consistent contact throughout, preventing those with no right to be here from disappearing into exploitative illegal working and ensure they can be returned. This enforcement surge, overseen by Bas Javid, the Home Office’s Director General for Immigration Enforcement, is part of the government’s plans to transform the asylum system and secure UK borders. This will ensure that all Immigration Enforcement processes are implemented firmly, fairly, and accurately throughout, whilst also taking account of the important lessons learnt from Windrush. We are taking strong and clear steps boost our border security and ensure the rules are respected and enforced. Our new Border Security Command is already gearing up, with new staff being urgently recruited and additional staff already stationed across Europe, working with European enforcement agencies to find every route in to smashing the criminal smuggling gangs organising dangerous boat crossings which undermine our border security and putting lives at risk. And by increasing enforcement capabilities and returns we will establish a system that is better controlled and managed, in place of the chaos that has blighted the system for far too long. NCA Director General of Operations Rob Jones said: Tackling organised immigration crime remains a key priority for the NCA and we are dedicating more effort and resource than ever before. These extra officers will play a key role in that, with the NCA currently leading around 70 investigations into the highest harm people smuggling and trafficking groups. Taking on these dangerous and exploitative gangs requires international co-operation and we continue to further enhance our already strong relationship with Europol and other law enforcement partners. We are determined to do all we can to disrupt and dismantle these networks, whether they are operating in the UK or overseas. This work builds on the Prime Minister’s meeting at the European Political Community last month, where he held discussions with the migration working group alongside Italy, Albania, Germany, Malta, Denmark, Hungary, The Netherlands, and Slovakia. The European leaders discussed border security, their joint efforts to tackle people-smuggling, and the ambition to work collectively with other countries to deliver solutions. Since taking office, the Home Secretary has also held calls with a range of partners to discuss increasing cooperation to tackle organised immigration crime.

Summary events and their dates in the JSON format.

Output:

{
  "events": [
    {
      "event": "Home Secretary Yvette Cooper announces new measures to boost Britain's border security, including the recruitment of up to 100 new specialist intelligence and investigation officers at the National Crime Agency (NCA).",
      "date": "2024-08-21"
    },
    {
      "event": "Announcement of a major surge in immigration enforcement and returns activity to achieve the highest rate of removals of those with no right to be in the UK since 2018.",
      "date": "2024-08-21"
    },
    {
      "event": "Introduction of a new intelligence-driven illegal working programme to target and take down employers who illegally employ individuals with no right to work in the UK.",
      "date": "2024-08-21"
    },
    {
      "event": "The government announces increased detention capacity, including 290 added beds at Campsfield and Haslar Immigration Removal Centres.",
      "date": "2024-08-21"
    },
    {
      "event": "The Prime Minister's meeting at the European Political Community last month, where discussions were held with European leaders on border security and tackling people-smuggling.",
      "date": "Last month (relative to 2024-08-21)"
    }
  ]
}
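Downstream code is expected to parse model output in exactly this shape; a minimal sketch of that parsing step is shown below (the string is a trimmed, made-up example rather than real model output).

import json

raw_output = '{"events": [{"event": "Example announcement of new funding.", "date": "2024-09-17"}]}'
for ev in json.loads(raw_output)["events"]:
    print(ev["date"], "-", ev["event"])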
system/initial_searching.py
ADDED
@@ -0,0 +1,101 @@
import json
import os
import time
import requests
import pandas as pd
from datetime import datetime
from pathlib import Path
import spacy
import subprocess

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("🔁 Downloading en_core_web_sm model ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

def clean_keywords(text):
    doc = nlp(text)
    keywords = []
    for chunk in doc.noun_chunks:
        words = [token.text for token in chunk if not token.is_stop and token.is_alpha]
        if words:
            cleaned_phrase = " ".join(words)
            if len(cleaned_phrase) > 2:
                keywords.append(cleaned_phrase)
    return list(set(keywords))

def google_search(query, api_key, search_engine_id, start_date, end_date):
    print(f"[SYSTEM] Calling Google Search API for: {query}")
    sort = f"date:r:{start_date}:{end_date}"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "key": api_key,
        "cx": search_engine_id,
        "num": 10,
        "sort": sort,
        "cr": "countryUK",
        "gl": "uk"
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json().get("items", [])
    except Exception as e:
        print(f"[ERROR] Google Search Failed: {e}")
        return []

def save_tsv(file_path, claim_id, claim_text, url_list):
    df = pd.DataFrame({
        'ID': [claim_id] * len(url_list),
        'String': ["claim"] * len(url_list),
        'ListValue': url_list,
        'query': [claim_text] * len(url_list)
    })
    df.to_csv(file_path, sep='\t', index=False, header=False)

def ensure_directory_exists(path):
    dir_path = Path(path).expanduser().resolve().parent
    if not str(dir_path).startswith("/home") and not str(dir_path).startswith("/data") and not str(dir_path).startswith("outputs"):
        raise ValueError(f"[ERROR] Unsafe path: {dir_path}")
    dir_path.mkdir(parents=True, exist_ok=True)

def run_initial_searching(claim_text, pipeline_base_dir, start_date, end_date, user_id, claim_id):
    api_key = os.environ.get("GOOGLE_API_KEY")
    search_engine_id = os.environ.get("GOOGLE_SEARCH_CX")
    if not api_key or not search_engine_id:
        raise EnvironmentError("[ERROR] GOOGLE_API_KEY and GOOGLE_SEARCH_CX must be set in environment.")

    base_dir = pipeline_base_dir
    manifesto_json_file = os.path.join(base_dir, "claim.json")
    tsv_file_path = os.path.join(base_dir, "initial_search_results.tsv")

    ensure_directory_exists(tsv_file_path)

    claim_record = {"claim_id": claim_id, "claim": claim_text}
    # if manifesto_json_file.exists():
    #     with open(manifesto_json_file, "r") as f:
    #         records = json.load(f)
    # else:
    records = []
    records.append(claim_record)
    with open(manifesto_json_file, "w") as f:
        json.dump(records, f, indent=1)

    urls = []
    results = google_search(f"{claim_text}", api_key, search_engine_id, start_date, end_date)
    urls += [r["link"] for r in results if "link" in r]
    keywords = clean_keywords(claim_text)
    keyword_text = " ".join(keywords)
    # for kw in keywords:
    #     results = google_search(kw, api_key, search_engine_id, start_date, end_date)
    #     urls += [r["link"] for r in results if "link" in r]
    results = google_search(keyword_text, api_key, search_engine_id, start_date, end_date)
    urls += [r["link"] for r in results if "link" in r]
    urls = list(dict.fromkeys(urls))

    save_tsv(str(tsv_file_path), claim_id, claim_text, urls)
    print(f"[SYSTEM] Saved {len(urls)} URLs for claim {claim_id} to {tsv_file_path}")
    return str(tsv_file_path), str(manifesto_json_file)
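For illustration, a small sketch of the keyword-cleaning step on a toy claim; the exact phrases returned depend on the spaCy model, so the comment only indicates the kind of output to expect.

from system.initial_searching import clean_keywords

keywords = clean_keywords(
    "We will support families with children by introducing free breakfast clubs in every primary school"
)
print(keywords)  # e.g. noun phrases such as "families", "children", "free breakfast clubs"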
system/instruction.txt
ADDED
@@ -0,0 +1,7 @@
You are given a pledge, the pledge speaker, the date when the pledge was made, and a key event summarized from an online article along with the date when the event happened. Your task is to determine whether this event summary is useful for tracking the fulfilment of this pledge.

Yes:
The summary presents developments or actions that demonstrate progress (or lack thereof) towards fulfilling the pledge. It helps evaluate whether the pledge is on track or not.

No:
The summary only provides background or contextual information, but no progress information for evaluating the fulfilment of the pledge. Or the summary is only weakly related, or not related at all, to the pledge.
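This instruction is presumably combined with a pledge and a single extracted event into one prompt inside system/process_time.py (which is truncated below); a hedged sketch of that assembly, with made-up field values, might look like this.

# Hypothetical prompt assembly; the field values are placeholders and the real
# formatting lives in system/process_time.py, which is not shown in full here.
instruction = open("system/instruction.txt", "r").read()

pledge = "We will support families with children by introducing free breakfast clubs in every primary school"
pledge_speaker = "Labour Party"      # placeholder
pledge_date = "2024-06-13"           # placeholder
event = "Free breakfast clubs launched in a first group of schools."  # placeholder
event_date = "2025-04-22"            # placeholder

prompt = (
    f"{instruction}\n\n"
    f"Pledge: {pledge}\nSpeaker: {pledge_speaker}\nPledge date: {pledge_date}\n"
    f"Event: {event}\nEvent date: {event_date}\n\n"
    "Answer Yes or No:"
)
print(prompt)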
system/pledge_tracking.py
ADDED
@@ -0,0 +1,219 @@
from huggingface_hub import login
from datetime import datetime
import os, time
import pandas as pd

from system.initial_searching import run_initial_searching
from system.scraper import run_scraper
from system.hero_pipeline import run_hero_pipeline, run_hero_reranking
from system.augmented_searching import run_augmented_searching
from system.generate_output import process_manifesto_data_with_metadata
from system.ee import run_gpt4_event_extraction
from system.process_time import extract_and_sort_events
import spacy
import subprocess
from huggingface_hub import hf_hub_download
import json

try:
    spacy.load("en_core_web_sm")
except OSError:
    print("🔁 Downloading en_core_web_sm model ...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")


def count_total_events(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        results = json.load(f)

    total_events = 0
    for item in results:
        try:
            events = item["output"]
            if isinstance(events, list):
                total_events += len(events)
            else:
                print(f"Invalid events entry: {events}")
        except KeyError:
            print(f"Item missing 'output' key: {item}")

    print(f"{total_events} events in total")
    return total_events


def run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id, update_fn=None, suggestion_meta=None):
    pipeline_base_dir = f"outputs/{timestamp}_{user_id}"
    os.makedirs(pipeline_base_dir, exist_ok=True)

    step_id = 1

    # Step 1: Google search
    if suggestion_meta is None:

        print("🔍 Step 1: Initial searching ...")
        initial_tsv_file, claim_json_path = run_initial_searching(
            claim_text=f"{pledge_author} : {claim}",
            # pledge_author=pledge_author,
            pipeline_base_dir=pipeline_base_dir,
            start_date=start_date,
            end_date="",
            user_id=user_id,
            claim_id=0,
        )
        with open(initial_tsv_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f)
        if update_fn:
            update_fn(step_id, f"We have found {line_count} URLs")
        step_id += 1

        print("🌐 Step 2: Scraping URLs ...")
        initial_data_store_dir = os.path.join(pipeline_base_dir, "initial_data_store")
        os.makedirs(initial_data_store_dir, exist_ok=True)
        initial_scraped_output_path = os.path.join(initial_data_store_dir, "0.jsonl")
        run_scraper(initial_tsv_file, initial_scraped_output_path)

        with open(initial_scraped_output_path, "r", encoding="utf-8") as f:
            line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
        if update_fn:
            update_fn(step_id, f"We have scraped {line_count} URLs")
        step_id += 1

        print("🧠 Step 3: HerO processing ...")
        hero_output_dir = os.path.join(pipeline_base_dir, "hero")
        os.makedirs(hero_output_dir, exist_ok=True)
        run_hero_pipeline(pipeline_base_dir)

        qa_file_path = os.path.join(hero_output_dir, "manifesto_icl_top_k_qa.json")

        with open(qa_file_path, "r", encoding="utf-8") as f:
            questions = {line["question"] for line in json.load(f)["evidence"]}
        line_count = len(questions)
        if update_fn:
            update_fn(step_id, f"We have generated {line_count} search queries")
        step_id += 1

    else:
        claim_json_path = None
        initial_scraped_output_path = None
        initial_tsv_file = None
        hero_output_dir = None
        qa_file_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_with_QA_icl_top_k_qa.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"]
        )
        print(qa_file_path)

    augmented_tsv_file = run_augmented_searching(
        qa_file=qa_file_path,
        pledge_author=pledge_author,
        pipeline_base_dir=pipeline_base_dir,
        start_date=start_date,
        suggestion_meta=suggestion_meta,
        end_date="",
        user_id=user_id,
        claim_id=0,
    )
    with open(augmented_tsv_file, "r", encoding="utf-8") as f:
        line_count = sum(1 for line in f)
    if update_fn:
        update_fn(step_id, f"We have found {line_count} URLs")
    step_id += 1

    augmented_data_store_dir = os.path.join(pipeline_base_dir, "augmented_data_store")
    os.makedirs(augmented_data_store_dir, exist_ok=True)
    augmented_scraped_output_path = os.path.join(augmented_data_store_dir, "0.jsonl")
    run_scraper(augmented_tsv_file, augmented_scraped_output_path)

    with open(augmented_scraped_output_path, "r", encoding="utf-8") as f:
        line_count = sum(1 for line in f if json.loads(line)["url2text"] != [])
    if update_fn:
        update_fn(step_id, f"We have scraped {line_count} URLs")
    step_id += 1

    run_hero_reranking(pipeline_base_dir, suggestion_meta)

    # Step 7: Preparing for GPT-4
    # print("🧠 Step 7: Processing format ...")

    meta_data_dir = process_manifesto_data_with_metadata(input_base_dir=pipeline_base_dir)

    # Step 8: Event extraction using GPT-4
    print("🧠 Extracting events ...")

    all_info_path = os.path.join(pipeline_base_dir, "all_info_with_txt.json")
    unique_urls = set()
    with open(all_info_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            docs = data.get("evidence", [])
            for doc in docs:
                if "url" in doc:
                    unique_urls.add(doc["url"])
    if update_fn:
        update_fn(step_id, f"We have found {len(unique_urls)} most relevant documents")
    step_id += 1

    extracted_event_path = run_gpt4_event_extraction(data_dir=pipeline_base_dir, icl_path="system/icl.txt", max_tokens=100000)

    events_num = count_total_events(extracted_event_path)

    if update_fn:
        update_fn(step_id, f"We have extracted {events_num} events")
    step_id += 1

    # Step 9: Sorting events and labelling usefulness
    print("📅 Sorting events temporally ...")

    sorted_events = extract_and_sort_events(
        data_dir=pipeline_base_dir,
        pledge_date=pledge_date,
        pledge_author=pledge_author,
        claim=claim,
        suggestion_meta=suggestion_meta
    )
    print(sorted_events)
    df = pd.DataFrame(sorted_events)
    sorted_event_path = f"{pipeline_base_dir}/sorted_events.xlsx"
    df.to_excel(sorted_event_path, index=False)

    if update_fn:
        update_fn(step_id, "All events are sorted!")

    return {
        "claim_json": claim_json_path,
        "initial_scraped_jsonl": initial_scraped_output_path,
        "initial_tsv_file": initial_tsv_file,
        "hero_dir": hero_output_dir,
        "augmented_scraped_jsonl": augmented_scraped_output_path,
        "augmented_tsv_file": augmented_tsv_file,
        "meta_data_dir": meta_data_dir,
        "unsorted_events": extracted_event_path,
        "sorted_events": sorted_event_path,
    }


if __name__ == "__main__":
    start = time.time()

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        print("No Hugging Face token found in environment variable HF_TOKEN.")

    # Placeholder values for a local test run; pledge_date and pledge_author are
    # illustrative only (the original snippet omitted them and passed an undefined
    # variable to run_pipeline).
    claim = "“We will support families with children by introducing free breakfast clubs in every primary school”"
    pledge_date = "2024-07-04"
    pledge_author = "Labour Party"
    start_date = "20250504"
    timestamp = "xxxxx"
    user_id = "xxx"

    outputs = run_pipeline(claim, pledge_date, pledge_author, start_date, timestamp, user_id)
    print("🎯 Pipeline finished. Outputs:", outputs)
    print(f"⏱️ Total time: {time.time() - start:.2f} seconds")
system/process_time.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import datetime
|
3 |
+
import re
|
4 |
+
import pandas as pd
|
5 |
+
import os, argparse
|
6 |
+
import random
|
7 |
+
import csv
|
8 |
+
from openai import OpenAI
|
9 |
+
from huggingface_hub import hf_hub_download
|
10 |
+
import json
|
11 |
+
import os
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
def gpt_4o_useful(input):
|
16 |
+
client=OpenAI(api_key=os.environ.get("OAI"))
|
17 |
+
response = client.chat.completions.create(
|
18 |
+
model="gpt-4o",
|
19 |
+
messages=[
|
20 |
+
{
|
21 |
+
"role": "user",
|
22 |
+
"content": [
|
23 |
+
{
|
24 |
+
"type": "text",
|
25 |
+
"text": input
|
26 |
+
}
|
27 |
+
]
|
28 |
+
}
|
29 |
+
],
|
30 |
+
response_format={"type": "text"},
|
31 |
+
temperature=0.0000000001,
|
32 |
+
max_tokens=4096,
|
33 |
+
top_p=0,
|
34 |
+
frequency_penalty=0,
|
35 |
+
presence_penalty=0,
|
36 |
+
logprobs=True
|
37 |
+
)
|
38 |
+
|
39 |
+
text = response.choices[0].message.content
|
40 |
+
|
41 |
+
if response.choices[0].logprobs and response.choices[0].logprobs.content:
|
42 |
+
first_token_logprob = response.choices[0].logprobs.content[0]
|
43 |
+
token = first_token_logprob.token
|
44 |
+
logprob = first_token_logprob.logprob
|
45 |
+
else:
|
46 |
+
token = None
|
47 |
+
logprob = None
|
48 |
+
|
49 |
+
return text, token, logprob
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
def get_ICL(data, top_k=None):
|
54 |
+
|
55 |
+
ICL =""
|
56 |
+
if top_k == None:
|
57 |
+
data = data
|
58 |
+
else:
|
59 |
+
# print(data)
|
60 |
+
data = data[:top_k]
|
61 |
+
for line in data:
|
62 |
+
# line = json.loads(line)
|
63 |
+
pledge = line["pledge"]
|
64 |
+
event = line["event_description"]
|
65 |
+
time = line["event_date"]
|
66 |
+
input=f"Pledge: {pledge}\nEvent Summary: {event} (Event Date: {time})\nIs this event summary useful?"
|
67 |
+
input = input.strip()
|
68 |
+
output = line["label"].strip()
|
69 |
+
ICL = f"{ICL}Input: {input}\nOutput: {output}\n\n"
|
70 |
+
return ICL
|
71 |
+
|
72 |
+
def load_json(file_path):
|
73 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
74 |
+
data = json.load(f)
|
75 |
+
return data
|
76 |
+
|
77 |
+
|
78 |
+
def gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=None):
|
79 |
+
|
80 |
+
if suggestion_meta:
|
81 |
+
# print(ICL_id)
|
82 |
+
|
83 |
+
train_data = [line for line in train_data if str(line.get("pledge_id")) == str(ICL_id)]
|
84 |
+
|
85 |
+
else:
|
86 |
+
random.seed(42)
|
87 |
+
random.shuffle(train_data)
|
88 |
+
|
89 |
+
ICL = get_ICL(train_data, top_k=50)
|
90 |
+
# print(ICL)
|
91 |
+
input = f"{instruction}\nBelow are examples:\n\n{ICL}Now, please assign a label for the below instance.\nInput: {test_instance}\nOutput:"
|
92 |
+
|
93 |
+
try:
|
94 |
+
text, tokens, logprobs = gpt_4o_useful(input)
|
95 |
+
except Exception as e:
|
96 |
+
print(e)
|
97 |
+
tokens = None
|
98 |
+
logprobs = None
|
99 |
+
|
100 |
+
return tokens, logprobs
|
101 |
+
|
102 |
+
def extract_columns_to_dict(file_path, delimiter='\t'):
|
103 |
+
|
104 |
+
data_dict = {}
|
105 |
+
|
106 |
+
with open(file_path, mode='r', encoding='utf-8') as file:
|
107 |
+
reader = csv.reader(file, delimiter=delimiter)
|
108 |
+
for row in reader:
|
109 |
+
if len(row) >= 4:
|
110 |
+
key = row[2]
|
111 |
+
value = row[3]
|
112 |
+
data_dict[key] = value
|
113 |
+
|
114 |
+
return data_dict
|
115 |
+
|
116 |
+
|
117 |
+
def parse_date(date_str):
|
118 |
+
try:
|
119 |
+
return datetime.datetime.strptime(date_str, "%Y-%m-%d"), date_str
|
120 |
+
except ValueError:
|
121 |
+
match = re.search(r'(.*) \(relative to (\d{4}-\d{2}-\d{2})\)', date_str)
|
122 |
+
if match:
|
123 |
+
reference = datetime.datetime.strptime(match.group(2), "%Y-%m-%d")
|
124 |
+
if "Last month" in match.group(1):
|
125 |
+
return reference - datetime.timedelta(days=30), date_str
|
126 |
+
elif "Yesterday" in match.group(1):
|
127 |
+
return reference - datetime.timedelta(days=1), date_str
|
128 |
+
elif "Last week" in match.group(1):
|
129 |
+
return reference - datetime.timedelta(days=7), date_str
|
130 |
+
elif "This week" in match.group(1):
|
131 |
+
return reference, date_str
|
132 |
+
|
133 |
+
# 处理不同格式的日期
|
134 |
+
match = re.fullmatch(r'\d{4}', date_str) # 处理年份格式: '2014'
|
135 |
+
if match:
|
136 |
+
return datetime.datetime(int(date_str), 1, 1), date_str
|
137 |
+
|
138 |
+
match = re.fullmatch(r'(\w+) (\d{4})', date_str) # 处理月份+年份格式: 'November 2023'
|
139 |
+
if match:
|
140 |
+
try:
|
141 |
+
return datetime.datetime.strptime(date_str, "%B %Y"), date_str
|
142 |
+
except ValueError:
|
143 |
+
return None, date_str
|
144 |
+
|
145 |
+
match = re.fullmatch(r'(\d{4})-Q(\d)', date_str) # 处理季度格式: '2024-Q1'
|
146 |
+
if match:
|
147 |
+
year, quarter = int(match.group(1)), int(match.group(2))
|
148 |
+
month = (quarter - 1) * 3 + 1
|
149 |
+
return datetime.datetime(year, month, 1), date_str
|
150 |
+
|
151 |
+
match = re.fullmatch(r'(\d{4}) (Spring|Summer|Autumn|Fall|Winter)', date_str, re.IGNORECASE) # 处理季度名称格式: '2023 Autumn' 或 '2023 Fall'
|
152 |
+
if match:
|
153 |
+
year = int(match.group(1))
|
154 |
+
season_map = {"Spring": 3, "Summer": 6, "Autumn": 9, "Fall": 9, "Winter": 12}
|
155 |
+
month = season_map[match.group(2).capitalize()]
|
156 |
+
return datetime.datetime(year, month, 1), date_str
|
157 |
+
|
158 |
+
return None, date_str
|
159 |
+
|
160 |
+
def extract_and_sort_events(data_dir, pledge_date, pledge_author, claim, suggestion_meta):
|
161 |
+
|
162 |
+
events = []
|
163 |
+
|
164 |
+
# url_path = os.path.join(data_dir, "augmented_search_results.tsv")
|
165 |
+
# url_query_dict = extract_columns_to_dict(file_path=url_path, delimiter='\t')
|
166 |
+
|
167 |
+
pledge = claim.strip()
|
168 |
+
|
169 |
+
file_path = os.path.join(data_dir, "gpt4_event_extraction", "gpt4o_results_0_claim.json")
|
170 |
+
gpt4_results_json = load_json(file_path)
|
171 |
+
|
172 |
+
print(gpt4_results_json)
|
173 |
+
train_file_path = hf_hub_download(
|
174 |
+
repo_id="PledgeTracker/demo_feedback",
|
175 |
+
filename="train_useful.json",
|
176 |
+
repo_type="dataset",
|
177 |
+
token=os.environ["HF_TOKEN"]
|
178 |
+
)
|
179 |
+
|
180 |
+
with open(train_file_path, "r", encoding="utf-8") as f:
|
181 |
+
train_data = json.load(f)
|
182 |
+
print(train_data[0])
|
183 |
+
|
184 |
+
instruction = open(f"system/instruction.txt", "r").read()
|
185 |
+
|
186 |
+
map_file_path = hf_hub_download(
|
187 |
+
repo_id="PledgeTracker/demo_feedback",
|
188 |
+
filename="mapping.txt",
|
189 |
+
repo_type="dataset",
|
190 |
+
token=os.environ["HF_TOKEN"]
|
191 |
+
)
|
192 |
+
mapping_f = open(map_file_path, "r").readlines()
|
193 |
+
mapping = {}
|
194 |
+
|
195 |
+
for map_id, line in enumerate(mapping_f):
|
196 |
+
mapping[map_id] = int(line.strip())
|
197 |
+
|
198 |
+
ICL_id = None
|
199 |
+
if suggestion_meta:
|
200 |
+
try:
|
201 |
+
idx = int(suggestion_meta["index"])
|
202 |
+
ICL_id = mapping.get(idx)
|
203 |
+
print(f"[Suggestion] index: {idx} → pledge_id: {ICL_id}")
|
204 |
+
except Exception as e:
|
205 |
+
print(f"[Mapping error]: {e}")
|
206 |
+
|
207 |
+
for doc in gpt4_results_json:
|
208 |
+
mete_date = doc["date"]
|
209 |
+
for event in doc.get("output", {}).get("events", []):
|
210 |
+
parsed_date, original_date = parse_date(event["date"])
|
211 |
+
if parsed_date:
|
212 |
+
if mete_date!= parsed_date:
|
213 |
+
event_date_and_pub_date = original_date+f" ({mete_date})"
|
214 |
+
else:
|
215 |
+
event_date_and_pub_date = original_date
|
216 |
+
|
217 |
+
test_instance = f"Pledge: {pledge} (Speaker: {pledge_author}; Pledge Date: {pledge_date})\nEvent Summary: {event['event']} (Event Date: {original_date})\nIs this event summary useful?"
|
218 |
+
|
219 |
+
print(test_instance)
|
220 |
+
|
221 |
+
label, score = gpt_eval(test_instance, train_data, instruction, suggestion_meta, ICL_id=ICL_id)
|
222 |
+
|
223 |
+
URL = doc["url"]
|
224 |
+
events.append({
|
225 |
+
"date": original_date,
|
226 |
+
"event date (publication date if different)": event_date_and_pub_date,
|
227 |
+
"event": event["event"],
|
228 |
+
"url": URL,
|
229 |
+
"label": label,
|
230 |
+
"confident": score
|
231 |
+
})
|
232 |
+
|
233 |
+
# 按时间排序
|
234 |
+
events.sort(key=lambda x: parse_date(x["date"])[0], reverse=True)
|
235 |
+
return events
|
236 |
+
|
237 |
+
|
238 |
+
|
system/scraper.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
2 |
+
import os
|
3 |
+
import csv
|
4 |
+
import json
|
5 |
+
import fitz
|
6 |
+
import time
|
7 |
+
import requests
|
8 |
+
import pandas as pd
|
9 |
+
from time import sleep
|
10 |
+
from pathlib import Path
|
11 |
+
from system.html2lines import url2lines, line_correction, html2metadata
|
12 |
+
|
13 |
+
MAX_RETRIES = 3
|
14 |
+
TIMEOUT = 5 # seconds
|
15 |
+
|
16 |
+
|
17 |
+
def scrape_text_from_url(url, temp_name):
|
18 |
+
response = None
|
19 |
+
for attempt in range(MAX_RETRIES):
|
20 |
+
try:
|
21 |
+
response = requests.get(url, timeout=TIMEOUT)
|
22 |
+
break
|
23 |
+
except requests.RequestException:
|
24 |
+
if attempt < MAX_RETRIES - 1:
|
25 |
+
sleep(3)
|
26 |
+
|
27 |
+
if response is None or response.status_code == 503:
|
28 |
+
return []
|
29 |
+
|
30 |
+
if url.endswith(".pdf"):
|
31 |
+
pdf_dir = Path("/tmp/pdf_dir")
|
32 |
+
pdf_dir.mkdir(parents=True, exist_ok=True)
|
33 |
+
pdf_path = pdf_dir / f"{temp_name}.pdf"
|
34 |
+
with open(pdf_path, "wb") as f:
|
35 |
+
f.write(response.content)
|
36 |
+
|
37 |
+
extracted_text = ""
|
38 |
+
doc = fitz.open(str(pdf_path))
|
39 |
+
for page in doc:
|
40 |
+
extracted_text += page.get_text() or ""
|
41 |
+
|
42 |
+
return line_correction(extracted_text.split("\n"))
|
43 |
+
|
44 |
+
return line_correction(url2lines(url))
|
45 |
+
|
46 |
+
def process_row(row, claim_id):
|
47 |
+
try:
|
48 |
+
url = row[2]
|
49 |
+
json_data = {
|
50 |
+
"claim_id": claim_id,
|
51 |
+
"type": row[1],
|
52 |
+
"query": row[3],
|
53 |
+
"url": url,
|
54 |
+
"url2text": scrape_text_from_url(url, claim_id),
|
55 |
+
"metadata": {}
|
56 |
+
}
|
57 |
+
meta = html2metadata(url)
|
58 |
+
json_data["metadata"] = {
|
59 |
+
"title": meta.get("title"),
|
60 |
+
"date": meta.get("date")
|
61 |
+
}
|
62 |
+
return json_data
|
63 |
+
except Exception as e:
|
64 |
+
print(f"[WARN] Failed to scrape {row[2]}: {e}")
|
65 |
+
return None
|
66 |
+
|
67 |
+
def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
|
68 |
+
claim_id = Path(tsv_file_path).stem
|
69 |
+
output_jsonl_path = Path(output_jsonl_path)
|
70 |
+
output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)
|
71 |
+
|
72 |
+
if output_jsonl_path.exists():
|
73 |
+
print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
|
74 |
+
return str(output_jsonl_path)
|
75 |
+
|
76 |
+
try:
|
77 |
+
df = pd.read_csv(tsv_file_path, sep="\t", header=None)
|
78 |
+
print("[INFO] Data loaded successfully with Pandas.")
|
79 |
+
except Exception as e:
|
80 |
+
raise RuntimeError(f"[ERROR] Failed to load TSV: {e}")
|
81 |
+
|
82 |
+
results = []
|
83 |
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
84 |
+
futures = [executor.submit(process_row, row, claim_id) for _, row in df.iterrows()]
|
85 |
+
for future in as_completed(futures):
|
86 |
+
result = future.result()
|
87 |
+
if result:
|
88 |
+
results.append(result)
|
89 |
+
|
90 |
+
with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
|
91 |
+
for item in results:
|
92 |
+
json_file.write(json.dumps(item, ensure_ascii=False) + "\n")
|
93 |
+
|
94 |
+
print(f"[SYSTEM] Output saved to {output_jsonl_path}")
|
95 |
+
return str(output_jsonl_path)
|
system/test.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import hf_hub_download
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
|
5 |
+
file_path = hf_hub_download(
|
6 |
+
repo_id="PledgeTracker/demo_feedback", # 你的私密 dataset 名
|
7 |
+
filename="train_useful.json", # 你上传的文件名
|
8 |
+
repo_type="dataset", # 必须设置为 dataset 类型
|
9 |
+
token=os.environ["HF_TOKEN"] # 需要 HF token 才能访问私密文件
|
10 |
+
)
|
11 |
+
|
12 |
+
with open(file_path, "r", encoding="utf-8") as f:
|
13 |
+
train_data = json.load(f)
|
14 |
+
|
15 |
+
print(train_data[0])
|
test.html
ADDED
@@ -0,0 +1,513 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="utf-8" />
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
6 |
+
<title>Pledge Tracker – Demo</title>
|
7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
+
</head>
|
9 |
+
<body class="bg-gray-50 text-gray-800">
|
10 |
+
<header class="bg-white shadow py-4 sticky top-0 z-10">
|
11 |
+
<div class="container mx-auto flex items-center justify-between px-4">
|
12 |
+
<div class="flex items-center gap-2">
|
13 |
+
<span class="text-2xl font-bold text-purple-600">🤗</span>
|
14 |
+
<span class="font-semibold text-lg">Pledge Tracking</span>
|
15 |
+
</div>
|
16 |
+
<nav class="hidden md:flex gap-6 font-medium">
|
17 |
+
<a class="hover:text-purple-600" href="#eval-response">Track Your Pledge</a>
|
18 |
+
<a class="hover:text-purple-600" href="#about">About</a>
|
19 |
+
</nav>
|
20 |
+
</div>
|
21 |
+
</header>
|
22 |
+
|
23 |
+
<section class="py-16 bg-gradient-to-r from-purple-50 to-purple-50 text-center">
|
24 |
+
<div class="container mx-auto px-4 max-w-2xl">
|
25 |
+
<h1 class="text-3xl md:text-4xl font-extrabold mb-4">
|
26 |
+
Fact-Checking Election Promises
|
27 |
+
</h1>
|
28 |
+
<p class="text-lg text-gray-600">
|
29 |
+
Extract progress towards fulfilling the promise.
|
30 |
+
</p>
|
31 |
+
</div>
|
32 |
+
</section>
|
33 |
+
|
34 |
+
<section id="eval-response" class="py-12">
|
35 |
+
<div class="container mx-auto px-4 max-w-4xl">
|
36 |
+
<!-- <h2 class="text-2xl font-bold mb-6">Track Manifesto Pledge</h2> -->
|
37 |
+
<label for="claim" class="block text-sm font-medium mb-2">
|
38 |
+
Please enter the pledge:
|
39 |
+
</label>
|
40 |
+
<textarea
|
41 |
+
id="claim"
|
42 |
+
class="w-full border rounded-lg p-3 h-40 focus:outline-none focus:ring-2 focus:ring-purple-500"
|
43 |
+
placeholder="For example: 'We will support families with children by introducing free breakfast clubs in every primary school...'"
|
44 |
+
></textarea>
|
45 |
+
|
46 |
+
<div id="similar-suggestions" class="mt-3 text-sm text-gray-600 hidden"></div>
|
47 |
+
|
48 |
+
<div class="mt-4">
|
49 |
+
<label for="pledge-date" class="block text-sm font-medium mb-2">
|
50 |
+
When was this pledge made?
|
51 |
+
</label>
|
52 |
+
<div class="grid grid-cols-[1fr_auto] items-center gap-2">
|
53 |
+
<input
|
54 |
+
type="date"
|
55 |
+
id="pledge-date"
|
56 |
+
class="w-full border rounded-lg p-2"
|
57 |
+
/>
|
58 |
+
<button
|
59 |
+
onclick="setDefaultDate()"
|
60 |
+
type="button"
|
61 |
+
class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
|
62 |
+
>
|
63 |
+
Use default: 4th Jul 2024
|
64 |
+
</button>
|
65 |
+
</div>
|
66 |
+
<div id="date-warning" class="text-sm text-red-600 mt-1 hidden">
|
67 |
+
Please select a date or click the button to use the default.
|
68 |
+
</div>
|
69 |
+
</div>
|
70 |
+
|
71 |
+
<div class="mt-4">
|
72 |
+
<label for="pledge-author" class="block text-sm font-medium mb-2">
|
73 |
+
Who made this pledge?
|
74 |
+
</label>
|
75 |
+
<div class="grid grid-cols-[1fr_auto] items-center gap-2">
|
76 |
+
<input
|
77 |
+
type="text"
|
78 |
+
id="pledge-author"
|
79 |
+
class="w-full border rounded-lg p-2"
|
80 |
+
placeholder="Enter the name of the party or person"
|
81 |
+
/>
|
82 |
+
<button
|
83 |
+
onclick="setDefaultAuthor()"
|
84 |
+
type="button"
|
85 |
+
class="px-2 py-1 text-sm bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
|
86 |
+
>
|
87 |
+
Use default: Labour
|
88 |
+
</button>
|
89 |
+
</div>
|
90 |
+
<div id="author-warning" class="text-sm text-red-600 mt-1 hidden">
|
91 |
+
Please enter a speaker or click the button to use the default.
|
92 |
+
</div>
|
93 |
+
</div>
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
<label for="time-range" class="block text-sm font-medium mt-4 mb-2">
|
98 |
+
Please select a time range:
|
99 |
+
</label>
|
100 |
+
<select id="time-range" class="w-full border rounded-lg p-2">
|
101 |
+
<option value="week">Past one week</option>
|
102 |
+
<option value="month">Past one month</option>
|
103 |
+
<!-- <option value="year">From when the pledge was made</option> -->
|
104 |
+
<option value="since_pledge_date">From when the pledge was made</option>
|
105 |
+
</select>
|
106 |
+
|
107 |
+
<button
|
108 |
+
id="check"
|
109 |
+
class="mt-4 px-6 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500"
|
110 |
+
>
|
111 |
+
Let's fact check!
|
112 |
+
</button>
|
113 |
+
|
114 |
+
<div id="progress" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
|
115 |
+
<h3 class="font-semibold mb-2">System Progress</h3>
|
116 |
+
<div id="status" class="text-sm text-gray-800 font-normal leading-relaxed"></div>
|
117 |
+
</div>
|
118 |
+
|
119 |
+
|
120 |
+
<div id="result" class="mt-6 hidden border p-4 rounded-lg bg-white shadow">
|
121 |
+
<h3 class="font-semibold mb-2">Result</h3>
|
122 |
+
<p class="text-gray-700"></p>
|
123 |
+
</div>
|
124 |
+
</div>
|
125 |
+
</section>
|
126 |
+
|
127 |
+
<section id="about" class="py-12">
|
128 |
+
<div class="container mx-auto px-4 max-w-4xl">
|
129 |
+
<h2 class="text-2xl font-bold mb-6">About</h2>
|
130 |
+
<p class="text-gray-700 leading-relaxed">
|
131 |
+
This demo connects a static front-end with a Python back-end using Flask.
|
132 |
+
The back-end generates event data and returns structured events related
|
133 |
+
to a manifesto pledge.
|
134 |
+
</p>
|
135 |
+
</div>
|
136 |
+
</section>
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
<script>
|
142 |
+
let suggestedPledge = null;
|
143 |
+
let currentAbortController = null;
|
144 |
+
const feedbackData = {};
|
145 |
+
let lastUsedFile = null;
|
146 |
+
let lastUserId = null;
|
147 |
+
let lastTimestamp = null;
|
148 |
+
const checkBtn = document.getElementById("check");
|
149 |
+
|
150 |
+
const stepListStandard = {
|
151 |
+
1: "Retrieving evidence related to the pledge",
|
152 |
+
2: "Scraping documents from URLs",
|
153 |
+
3: "Generating more queries based on the retrieved evidence",
|
154 |
+
4: "Searching more articles",
|
155 |
+
5: "Scraping documents from URLs",
|
156 |
+
6: "Finding the most relevant documents",
|
157 |
+
7: "Extracting events from top documents",
|
158 |
+
8: "Sorting events temporally"
|
159 |
+
};
|
160 |
+
|
161 |
+
const stepListSuggestion = {
|
162 |
+
1: "Retrieving evidence based on genertaed queries",
|
163 |
+
2: "Scraping documents from URLs",
|
164 |
+
3: "Finding the most relevant documents",
|
165 |
+
4: "Extracting events from top documents",
|
166 |
+
5: "Sorting events temporally"
|
167 |
+
};
|
168 |
+
|
169 |
+
let stepList = stepListStandard;
|
170 |
+
|
171 |
+
function renderStatus(statusDict) {
|
172 |
+
let html = "<ul class='list-disc ml-6 space-y-1 text-sm'>";
|
173 |
+
for (let step in stepList) {
|
174 |
+
const content = statusDict?.[step] || stepList[step];
|
175 |
+
const prefix = statusDict?.[step] ? "✅" : "⏳";
|
176 |
+
html += `<li>${prefix} Step ${step}: ${content}</li>`;
|
177 |
+
}
|
178 |
+
html += "</ul>";
|
179 |
+
return html;
|
180 |
+
}
|
181 |
+
|
182 |
+
function setDefaultDate() {
|
183 |
+
const input = document.getElementById("pledge-date");
|
184 |
+
input.value = "2024-07-04";
|
185 |
+
document.getElementById("date-warning").classList.add("hidden");
|
186 |
+
}
|
187 |
+
|
188 |
+
function setDefaultAuthor() {
|
189 |
+
const input = document.getElementById("pledge-author");
|
190 |
+
input.value = "Labour";
|
191 |
+
document.getElementById("author-warning").classList.add("hidden");
|
192 |
+
}
|
193 |
+
|
194 |
+
// function setFeedback(index, answer) {
|
195 |
+
// feedbackData[index] = answer;
|
196 |
+
// const message = document.getElementById(`msg-${index}`);
|
197 |
+
// message.textContent = `✓ Selected: ${answer ? 'Yes' : 'No'}`;
|
198 |
+
// message.className = answer
|
199 |
+
// ? "text-sm text-green-600 mt-1"
|
200 |
+
// : "text-sm text-red-600 mt-1";
|
201 |
+
// }
|
202 |
+
function setFeedback(index, answer) {
|
203 |
+
feedbackData[index] = answer;
|
204 |
+
const message = document.getElementById(`msg-${index}`);
|
205 |
+
|
206 |
+
let displayText = "";
|
207 |
+
let colorClass = "";
|
208 |
+
|
209 |
+
switch(answer) {
|
210 |
+
case "not_relevant":
|
211 |
+
displayText = "Not relevant";
|
212 |
+
colorClass = "text-red-300";
|
213 |
+
break;
|
214 |
+
case "relevant_seen":
|
215 |
+
displayText = "Relevant but already seen";
|
216 |
+
colorClass = "text-grey-400";
|
217 |
+
break;
|
218 |
+
case "relevant_updated":
|
219 |
+
displayText = "Relevant and up-to-date";
|
220 |
+
colorClass = "text-blue-400";
|
221 |
+
break;
|
222 |
+
}
|
223 |
+
|
224 |
+
message.textContent = `✓ Selected: ${displayText}`;
|
225 |
+
message.className = `text-sm ${colorClass} mt-1`;
|
226 |
+
}
|
227 |
+
|
228 |
+
function pollStatus(userId, timestamp, statusElement) {
|
229 |
+
if (window.pollIntervalId) {
|
230 |
+
clearInterval(window.pollIntervalId);
|
231 |
+
}
|
232 |
+
|
233 |
+
window.pollIntervalId = setInterval(async () => {
|
234 |
+
try {
|
235 |
+
const res = await fetch(`/api/status?user_id=${userId}×tamp=${timestamp}&_=${Date.now()}`);
|
236 |
+
const data = await res.json();
|
237 |
+
|
238 |
+
// 动态渲染结构化状态
|
239 |
+
if (data.status) {
|
240 |
+
statusElement.innerHTML = renderStatus(data.status);
|
241 |
+
}
|
242 |
+
|
243 |
+
// 检查是否完成
|
244 |
+
const values = Object.values(data.status || {});
|
245 |
+
const finalText = values.join(" ").toLowerCase();
|
246 |
+
|
247 |
+
if (finalText.includes("done") || finalText.includes("finished")) {
|
248 |
+
clearInterval(window.pollIntervalId);
|
249 |
+
window.pollIntervalId = null;
|
250 |
+
statusElement.innerHTML += `<div class="mt-2 text-green-600 font-semibold">✅ All done.</div>`;
|
251 |
+
checkBtn.disabled = false;
|
252 |
+
checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
|
253 |
+
if (lastUsedFile) loadEvents(lastUsedFile);
|
254 |
+
} else if (finalText.includes("error") || finalText.includes("fail")) {
|
255 |
+
clearInterval(window.pollIntervalId);
|
256 |
+
window.pollIntervalId = null;
|
257 |
+
statusElement.innerHTML += `<div class="mt-2 text-red-600 font-semibold">❌ The process failed.</div>`;
|
258 |
+
checkBtn.disabled = false;
|
259 |
+
checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
|
260 |
+
}
|
261 |
+
} catch (err) {
|
262 |
+
clearInterval(window.pollIntervalId);
|
263 |
+
window.pollIntervalId = null;
|
264 |
+
statusElement.innerHTML = `<div class="text-red-600">❌ Failed to check status: ${err.message}</div>`;
|
265 |
+
}
|
266 |
+
}, 2000);
|
267 |
+
}
|
268 |
+
|
269 |
+
|
270 |
+
|
271 |
+
async function submitAllFeedback() {
|
272 |
+
const entries = Object.entries(feedbackData);
|
273 |
+
if (entries.length === 0) {
|
274 |
+
alert("No feedback to submit!");
|
275 |
+
return;
|
276 |
+
}
|
277 |
+
const confirmed = confirm("Submit all feedback?");
|
278 |
+
if (!confirmed) return;
|
279 |
+
|
280 |
+
const pledgeText = document.getElementById("claim").value.trim();
|
281 |
+
|
282 |
+
const res = await fetch('/api/feedback', {
|
283 |
+
method: 'POST',
|
284 |
+
headers: { 'Content-Type': 'application/json' },
|
285 |
+
body: JSON.stringify({
|
286 |
+
pledge: pledgeText,
|
287 |
+
file: lastUsedFile,
|
288 |
+
user_id: lastUserId,
|
289 |
+
timestamp: lastTimestamp,
|
290 |
+
feedback: entries.map(([index, answer]) => ({
|
291 |
+
eventIndex: index,
|
292 |
+
answer: answer
|
293 |
+
}))
|
294 |
+
})
|
295 |
+
});
|
296 |
+
|
297 |
+
alert(res.ok ? "✅ Feedback submitted successfully!" : "❌ Submission failed.");
|
298 |
+
}
|
299 |
+
|
300 |
+
async function loadEvents(file) {
|
301 |
+
const resultBox = document.getElementById("result");
|
302 |
+
const p = resultBox.querySelector("p");
|
303 |
+
resultBox.classList.remove("hidden");
|
304 |
+
|
305 |
+
try {
|
306 |
+
const fileParam = encodeURIComponent(file);
|
307 |
+
const eventsRes = await fetch(`/api/events?file=${fileParam}`);
|
308 |
+
if (!eventsRes.ok) throw new Error("Event file not found or malformed");
|
309 |
+
const data = await eventsRes.json();
|
310 |
+
if (!Array.isArray(data)) throw new Error("Unexpected data format");
|
311 |
+
|
312 |
+
p.innerHTML = `<strong>We have found ${data.length} events for this pledge.</strong><br><br>` +
|
313 |
+
data.map((e, index) => `
|
314 |
+
<div class="mb-6 border-b pb-4">
|
315 |
+
🗓️ <b>${e.date}</b>: ${e.event}<br>
|
316 |
+
🔗 <a href="${e.url}" target="_blank" class="text-purple-400 underline">Source</a>
|
317 |
+
|
318 |
+
<div class="mt-3">
|
319 |
+
<label class="block text-sm font-medium mb-2">How relevant is this event?</label>
|
320 |
+
<div class="flex flex-wrap gap-2">
|
321 |
+
<button onclick="setFeedback(${index}, 'not_relevant')"
|
322 |
+
class="px-3 py-1.5 bg-gray-100 hover:bg-gray-200 border border-gray-300 rounded-lg text-gray-700">
|
323 |
+
Not relevant
|
324 |
+
</button>
|
325 |
+
<button onclick="setFeedback(${index}, 'relevant_seen')"
|
326 |
+
class="px-3 py-1.5 bg-blue-100 hover:bg-blue-200 border border-blue-300 rounded-lg text-blue-700">
|
327 |
+
Relevant but seen
|
328 |
+
</button>
|
329 |
+
<button onclick="setFeedback(${index}, 'relevant_updated')"
|
330 |
+
class="px-3 py-1.5 bg-green-100 hover:bg-green-200 border border-green-300 rounded-lg text-green-700">
|
331 |
+
Relevant & up-to-date
|
332 |
+
</button>
|
333 |
+
</div>
|
334 |
+
<div id="msg-${index}" class="text-sm mt-1"></div>
|
335 |
+
</div>
|
336 |
+
</div>
|
337 |
+
`).join('') +
|
338 |
+
`<button onclick="submitAllFeedback()" class="mt-6 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
|
339 |
+
📤 Submit All Feedback
|
340 |
+
</button>
|
341 |
+
<button onclick="window.location.href='/download?file=${fileParam}'" class="mt-4 ml-4 px-4 py-2 bg-purple-600 text-white rounded-lg hover:bg-purple-700">
|
342 |
+
📅 Download Excel
|
343 |
+
</button>`;
|
344 |
+
} catch (err) {
|
345 |
+
p.textContent = `❌ Failed to load timeline: ${err.message}`;
|
346 |
+
}
|
347 |
+
}
|
348 |
+
|
349 |
+
let suggestTimer = null;
|
350 |
+
document.getElementById("claim").addEventListener("input", () => {
|
351 |
+
clearTimeout(suggestTimer);
|
352 |
+
suggestTimer = setTimeout(fetchSuggestions, 300); // 300ms delay to avoid flooding
|
353 |
+
});
|
354 |
+
|
355 |
+
async function fetchSuggestions() {
|
356 |
+
const claimText = document.getElementById("claim").value.trim();
|
357 |
+
const suggestionBox = document.getElementById("similar-suggestions");
|
358 |
+
|
359 |
+
if (!claimText) {
|
360 |
+
suggestionBox.classList.add("hidden");
|
361 |
+
return;
|
362 |
+
}
|
363 |
+
|
364 |
+
const res = await fetch("/api/similar-pledges", {
|
365 |
+
method: "POST",
|
366 |
+
headers: { "Content-Type": "application/json" },
|
367 |
+
body: JSON.stringify({ claim: claimText })
|
368 |
+
});
|
369 |
+
const data = await res.json();
|
370 |
+
const suggestions = data.suggestions || [];
|
371 |
+
|
372 |
+
if (suggestions.length === 0) {
|
373 |
+
suggestionBox.classList.add("hidden");
|
374 |
+
} else {
|
375 |
+
const author = "Labour";
|
376 |
+
const date = "2024-07-04";
|
377 |
+
suggestionBox.innerHTML =
|
378 |
+
"<div class='font-semibold mb-1'>💡 Are you fact-checking ... </div>" +
|
379 |
+
"<ul class='list-disc ml-6 mt-1'>" +
|
380 |
+
suggestions.map(s => `
|
381 |
+
<li class="mb-2">
|
382 |
+
${author}: ${s.text} (${date})
|
383 |
+
<button
|
384 |
+
onclick="useSuggestedPledge('${s.text.replace(/'/g, "\\'")}', ${s.index})"
|
385 |
+
class="ml-2 px-2 py-1 text-xs bg-purple-600 text-white rounded hover:bg-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500">
|
386 |
+
Fact-check this pledge
|
387 |
+
</button>
|
388 |
+
</li>
|
389 |
+
`).join("") +
|
390 |
+
"</ul>";
|
391 |
+
suggestionBox.classList.remove("hidden");
|
392 |
+
}
|
393 |
+
}
|
394 |
+
|
395 |
+
|
396 |
+
checkBtn.addEventListener("click", async () => {
|
397 |
+
const claim = document.getElementById("claim").value.trim();
|
398 |
+
const pledgeDate = document.getElementById("pledge-date").value.trim();
|
399 |
+
const pledgeAuthor = document.getElementById("pledge-author").value.trim();
|
400 |
+
const statusElement = document.getElementById("status");
|
401 |
+
const resultBox = document.getElementById("result");
|
402 |
+
// resultBox.classList.remove("hidden");
|
403 |
+
const p = resultBox.querySelector("p");
|
404 |
+
|
405 |
+
|
406 |
+
|
407 |
+
let valid = true;
|
408 |
+
if (!claim) {
|
409 |
+
alert("Please enter the pledge text.");
|
410 |
+
valid = false;
|
411 |
+
}
|
412 |
+
if (!pledgeDate) {
|
413 |
+
document.getElementById("date-warning").classList.remove("hidden");
|
414 |
+
valid = false;
|
415 |
+
}
|
416 |
+
if (!pledgeAuthor) {
|
417 |
+
document.getElementById("author-warning").classList.remove("hidden");
|
418 |
+
valid = false;
|
419 |
+
}
|
420 |
+
|
421 |
+
if (!valid) return;
|
422 |
+
|
423 |
+
checkBtn.disabled = true;
|
424 |
+
checkBtn.classList.add("opacity-50", "cursor-not-allowed");
|
425 |
+
|
426 |
+
// document.getElementById("status").classList.remove("hidden");
|
427 |
+
statusElement.innerHTML = renderStatus({});
|
428 |
+
document.getElementById("result").classList.remove("hidden");
|
429 |
+
document.getElementById("progress").classList.remove("hidden");
|
430 |
+
|
431 |
+
|
432 |
+
|
433 |
+
try {
|
434 |
+
const timeRange = document.getElementById("time-range").value;
|
435 |
+
const pledgeDate = document.getElementById("pledge-date").value;
|
436 |
+
const pledgeAuthor = document.getElementById("pledge-author").value;
|
437 |
+
if (currentAbortController) currentAbortController.abort();
|
438 |
+
currentAbortController = new AbortController();
|
439 |
+
const signal = currentAbortController.signal;
|
440 |
+
let valid = true;
|
441 |
+
|
442 |
+
stepList = (suggestedPledge !== null) ? stepListSuggestion : stepListStandard;
|
443 |
+
|
444 |
+
if (!pledgeDate) {
|
445 |
+
document.getElementById("date-warning").classList.remove("hidden");
|
446 |
+
valid = false;
|
447 |
+
}
|
448 |
+
if (!pledgeAuthor) {
|
449 |
+
document.getElementById("author-warning").classList.remove("hidden");
|
450 |
+
valid = false;
|
451 |
+
}
|
452 |
+
if (!valid) return;
|
453 |
+
|
454 |
+
const userId = Math.random().toString(36).substring(2, 10);
|
455 |
+
const now = new Date();
|
456 |
+
const timestamp = now.toISOString().replace(/[:.]/g, "-").slice(0, 19);
|
457 |
+
statusElement.textContent = "";
|
458 |
+
// pollStatus(userId, timestamp, p);
|
459 |
+
pollStatus(userId, timestamp, document.getElementById("status"));
|
460 |
+
|
461 |
+
|
462 |
+
const runRes = await fetch("/api/run-model", {
|
463 |
+
method: "POST",
|
464 |
+
headers: { "Content-Type": "application/json" },
|
465 |
+
body: JSON.stringify({
|
466 |
+
claim,
|
467 |
+
time_range: timeRange,
|
468 |
+
pledge_date: pledgeDate,
|
469 |
+
pledge_author: pledgeAuthor,
|
470 |
+
user_id: userId,
|
471 |
+
timestamp: timestamp,
|
472 |
+
signal: signal,
|
473 |
+
suggestion_meta: suggestedPledge
|
474 |
+
})
|
475 |
+
});
|
476 |
+
|
477 |
+
const runData = await runRes.json();
|
478 |
+
|
479 |
+
lastUsedFile = runData.file;
|
480 |
+
lastUserId = runData.user_id;
|
481 |
+
lastTimestamp = runData.timestamp;
|
482 |
+
} catch (err) {
|
483 |
+
if (err.name === "AbortError") {
|
484 |
+
console.log("Previous request aborted.");
|
485 |
+
checkBtn.disabled = false;
|
486 |
+
checkBtn.classList.remove("opacity-50", "cursor-not-allowed");
|
487 |
+
return;
|
488 |
+
}
|
489 |
+
p.textContent = `❌ Failed to load timeline: ${err.message}`;
|
490 |
+
}
|
491 |
+
|
492 |
+
});
|
493 |
+
|
494 |
+
|
495 |
+
async function useSuggestedPledge(text, index) {
|
496 |
+
document.getElementById("claim").value = text;
|
497 |
+
document.getElementById("pledge-author").value = "Labour";
|
498 |
+
document.getElementById("pledge-date").value = "2024-07-04";
|
499 |
+
suggestedPledge = { text, index };
|
500 |
+
alert("✅ This pledge has been filled in. You can now click 'Let's fact check!'");
|
501 |
+
await fetch("/api/log-similar-selection", {
|
502 |
+
method: "POST",
|
503 |
+
headers: { "Content-Type": "application/json" },
|
504 |
+
body: JSON.stringify({
|
505 |
+
selected_text: text,
|
506 |
+
index: index
|
507 |
+
})
|
508 |
+
});
|
509 |
+
}
|
510 |
+
|
511 |
+
</script>
|
512 |
+
</body>
|
513 |
+
</html>
|