import argparse
import json
import os

import lxml.etree
from lxml.etree import tostring

from system.html2lines import html2metadata


def _load_data_store(json_path: str) -> list:
    """Load the per-claim records stored at *json_path*.

    The file is first parsed as one JSON document; if that fails it is
    re-read as JSON Lines (one JSON value per non-blank line).

    Returns:
        A list of records. A file holding a single JSON object yields a
        one-element list, so callers can always iterate over records.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        try:
            data_store = json.load(f)
        except json.JSONDecodeError:
            # Multi-line JSONL is not a single valid document: rewind and
            # parse line by line, ignoring blank lines.
            f.seek(0)
            return [json.loads(row) for row in f if row.strip()]
    # A one-line .jsonl parses fine as a single document but produces a dict;
    # iterating a dict would yield its keys rather than records.
    if isinstance(data_store, dict):
        return [data_store]
    return data_store


def _extract_metadata(url: str) -> dict:
    """Return ``{"title", "date"}`` for *url*, or empty strings on failure.

    Any exception raised while fetching or reading the metadata is reported
    on stdout and converted into the empty fallback (best effort).
    """
    try:
        meta = html2metadata(url)
        if isinstance(meta, lxml.etree._Element):
            # NOTE(review): after serialisation ``meta`` is a str, so the
            # key lookups below raise and the empty fallback is used.
            # Confirm whether element results should be parsed instead.
            meta = tostring(meta, encoding="unicode", pretty_print=True)
        return {"title": meta["title"], "date": meta["date"]}
    except Exception as e:
        print(f"Metadata extraction failed for URL: {url}, error: {e}")
        return {"title": "", "date": ""}


def process_manifesto_data_with_metadata(input_base_dir: str) -> str:
    """Attach page text and metadata to every retrieved QA item of each claim.

    Reads ``hero/manifesto_icl_reranking_top_k_QA.json`` (one JSON object per
    line, with ``claim`` and ``top_50`` keys) under *input_base_dir*. For the
    claim on line ``n`` (0-based) it looks up ``augmented_data_store/n.jsonl``
    to map each QA ``url`` to its ``url2text``, adds that as ``text``, adds
    ``metadata`` (title/date) obtained through ``html2metadata``, and writes
    one JSON line per claim to ``all_info_with_txt.json``.

    Claims whose data-store file is missing are reported on stdout and
    skipped, so the output may contain fewer lines than the input.

    Args:
        input_base_dir: Directory containing the input file and the
            ``augmented_data_store`` folder; the output is written here too.

    Returns:
        Path of the written output file.
    """
    input_file_path = os.path.join(
        input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json"
    )
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    # The context manager guarantees the output is flushed and closed even
    # if processing one of the claims raises.
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        for claim_idx, raw_line in enumerate(input_lines):
            record = json.loads(raw_line)
            new_line = {"claim": record["claim"], "evidence": []}
            qas = record["top_50"]

            json_path = os.path.join(url2text_dir, f"{claim_idx}.jsonl")
            if not os.path.exists(json_path):
                print(f"Warning: {json_path} not found")
                continue

            url_txt = {
                data["url"]: data["url2text"]
                for data in _load_data_store(json_path)
            }

            # Metadata per URL for this claim: a URL that appears in several
            # QA items is fetched only once.
            meta_by_url = {}
            for qa in qas:
                new_qa = qa.copy()
                url = qa["url"]
                new_qa["text"] = url_txt.get(url, "")
                if url not in meta_by_url:
                    meta_by_url[url] = _extract_metadata(url)
                # Copy so evidence entries never share one mutable dict.
                new_qa["metadata"] = dict(meta_by_url[url])
                new_line["evidence"].append(new_qa)

            out_file.write(json.dumps(new_line) + "\n")

    return output_file_path