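"""Merge reranked QA evidence with scraped page text and metadata.

Reads per-claim top-50 QA pairs, attaches the scraped text for each evidence
URL from the augmented data store, adds page title/date metadata via
html2metadata, and writes one JSON line per claim to all_info_with_txt.json.
"""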
import argparse
import json
import os

import lxml.etree

from system.html2lines import html2metadata

def process_manifesto_data_with_metadata(input_base_dir: str):
    """Attach scraped page text and metadata to each claim's top retrieved QA pairs."""
    input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    out_file = open(output_file_path, "w", encoding="utf-8")
    for idx, raw_line in enumerate(input_lines):
        line = json.loads(raw_line)
        claim = line["claim"]
        QAs = line["top_50"]
        new_line = {"claim": claim, "evidence": []}

        # Each claim has a per-index data store file mapping URLs to scraped text.
        json_path = os.path.join(url2text_dir, f"{idx}.jsonl")
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found")
            continue
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                # The data store may be a single JSON array ...
                data_store = json.load(f)
            except json.JSONDecodeError:
                # ... or JSON Lines: re-read one record per line.
                f.seek(0)
                data_store = [json.loads(record) for record in f]
        url_txt = {data["url"]: data["url2text"] for data in data_store}

        # Cache metadata per URL so each page is processed at most once.
        url_meta = {}
        for QA in QAs:
            newQA = QA.copy()
            URL = QA["url"]
            newQA["text"] = url_txt.get(URL, "")
            if URL not in url_meta:
                try:
                    meta = html2metadata(URL)
                    if isinstance(meta, lxml.etree._Element):
                        # html2metadata returned a parsed tree rather than a dict;
                        # recover the title from the tree and leave the date empty.
                        url_meta[URL] = {
                            "title": meta.findtext(".//title", default=""),
                            "date": "",
                        }
                    else:
                        url_meta[URL] = {
                            "title": meta.get("title", ""),
                            "date": meta.get("date", ""),
                        }
                except Exception as e:
                    print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                    url_meta[URL] = {"title": "", "date": ""}
            newQA["metadata"] = url_meta[URL]
            new_line["evidence"].append(newQA)
        out_file.write(json.dumps(new_line) + "\n")
    out_file.close()
    return output_file_path
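
if __name__ == "__main__":
    # argparse is imported above but was never wired up; a minimal CLI sketch,
    # assuming the base directory layout described in the function. The
    # --input_base_dir flag name is an assumption, not from the original.
    parser = argparse.ArgumentParser(
        description="Attach url2text passages and page metadata to reranked QA evidence."
    )
    parser.add_argument("--input_base_dir", type=str, required=True,
                        help="Directory containing hero/ and augmented_data_store/.")
    args = parser.parse_args()
    out_path = process_manifesto_data_with_metadata(args.input_base_dir)
    print(f"Wrote {out_path}")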