Spaces:
Sleeping
Sleeping
File size: 2,317 Bytes
35b3f62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import json
import os
import argparse
from system.html2lines import html2metadata
from lxml.etree import tostring
import lxml.etree
def process_manifesto_data_with_metadata(input_base_dir: str) -> str:
    """Attach evidence text and URL metadata to each claim's top-50 QA pairs.

    Reads claims (one JSON object per line with "claim" and "top_50" keys)
    from ``hero/manifesto_icl_reranking_top_k_QA.json`` under
    ``input_base_dir``, looks up scraped page text for each evidence URL in
    ``augmented_data_store/<claim_index>.jsonl``, fetches title/date metadata
    per URL via ``html2metadata``, and writes one enriched JSON object per
    claim to ``all_info_with_txt.json``.

    Args:
        input_base_dir: Directory containing the ``hero`` and
            ``augmented_data_store`` subdirectories.

    Returns:
        Path of the written ``all_info_with_txt.json`` file.

    Note:
        Claims whose data-store file is missing are skipped with a warning,
        so output lines may not align one-to-one with input lines.
    """
    input_file_path = os.path.join(
        input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json"
    )
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    # Per-URL metadata cache so each URL is fetched at most once.
    # (The original kept a `URLs` list that was never appended to, so its
    # dedup check never fired and `meta_save` could have carried a stale
    # value from a previous iteration.)
    url_meta: dict = {}

    # `with` guarantees the output file is closed even if a claim raises
    # (the original left the handle open on any exception).
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        for claim_idx, raw_line in enumerate(input_lines):
            record = json.loads(raw_line)
            claim = record["claim"]
            QAs = record["top_50"]
            new_line = {"claim": claim, "evidence": []}

            json_path = os.path.join(url2text_dir, f"{claim_idx}.jsonl")
            if not os.path.exists(json_path):
                print(f"Warning: {json_path} not found")
                continue

            with open(json_path, "r", encoding="utf-8") as store_f:
                try:
                    # The store may be a single JSON document (e.g. a list)...
                    data_store = json.load(store_f)
                except json.JSONDecodeError:
                    # ...or JSON Lines: one record per line.
                    store_f.seek(0)
                    data_store = [json.loads(entry) for entry in store_f]
            url_txt = {data["url"]: data["url2text"] for data in data_store}

            for QA in QAs:
                newQA = QA.copy()
                URL = QA["url"]
                newQA["text"] = url_txt.get(URL, "")

                meta_save = url_meta.get(URL)
                if meta_save is None:
                    try:
                        meta = html2metadata(URL)
                        if isinstance(meta, lxml.etree._Element):
                            # NOTE(review): serializing an Element yields a
                            # str, which cannot be indexed by "title"/"date"
                            # below and so always falls into the except —
                            # confirm html2metadata's return type; behavior
                            # (empty metadata for Element results) preserved.
                            meta = tostring(
                                meta, encoding="unicode", pretty_print=True
                            )
                        meta_save = {
                            "title": meta["title"],
                            "date": meta["date"],
                        }
                    except Exception as e:
                        print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                        meta_save = {
                            "title": "",
                            "date": ""
                        }
                    url_meta[URL] = meta_save

                newQA["metadata"] = meta_save
                new_line["evidence"].append(newQA)

            out_file.write(json.dumps(new_line) + "\n")

    return output_file_path
|