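"""Merge reranked QA evidence with scraped page text and metadata.

Reads per-claim top-50 QA pairs, attaches the scraped text for each evidence
URL from the augmented data store, adds page title/date metadata via
html2metadata, and writes one JSON line per claim to all_info_with_txt.json.
"""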
import argparse
import json
import os

import lxml.etree

from system.html2lines import html2metadata

def process_manifesto_data_with_metadata(input_base_dir: str):
    """Attach scraped page text and metadata to each claim's top retrieved QA pairs."""
    input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")
    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    out_file = open(output_file_path, "w", encoding="utf-8")
    for idx, raw_line in enumerate(input_lines):
        line = json.loads(raw_line)
        claim = line["claim"]
        QAs = line["top_50"]
        new_line = {"claim": claim, "evidence": []}

        # Each claim has a per-index data store file mapping URLs to scraped text.
        json_path = os.path.join(url2text_dir, f"{idx}.jsonl")
        if not os.path.exists(json_path):
            print(f"Warning: {json_path} not found")
            continue
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                # The data store may be a single JSON array ...
                data_store = json.load(f)
            except json.JSONDecodeError:
                # ... or JSON Lines: re-read one record per line.
                f.seek(0)
                data_store = [json.loads(record) for record in f]
        url_txt = {data["url"]: data["url2text"] for data in data_store}

        # Cache metadata per URL so each page is processed at most once.
        url_meta = {}
        for QA in QAs:
            newQA = QA.copy()
            URL = QA["url"]
            newQA["text"] = url_txt.get(URL, "")
            if URL not in url_meta:
                try:
                    meta = html2metadata(URL)
                    if isinstance(meta, lxml.etree._Element):
                        # html2metadata returned a parsed tree rather than a dict;
                        # recover the title from the tree and leave the date empty.
                        url_meta[URL] = {
                            "title": meta.findtext(".//title", default=""),
                            "date": "",
                        }
                    else:
                        url_meta[URL] = {
                            "title": meta.get("title", ""),
                            "date": meta.get("date", ""),
                        }
                except Exception as e:
                    print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                    url_meta[URL] = {"title": "", "date": ""}
            newQA["metadata"] = url_meta[URL]
            new_line["evidence"].append(newQA)
        out_file.write(json.dumps(new_line) + "\n")
    out_file.close()
    return output_file_path
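
if __name__ == "__main__":
    # argparse is imported above but was never wired up; a minimal CLI sketch,
    # assuming the base directory layout described in the function. The
    # --input_base_dir flag name is an assumption, not from the original.
    parser = argparse.ArgumentParser(
        description="Attach url2text passages and page metadata to reranked QA evidence."
    )
    parser.add_argument("--input_base_dir", type=str, required=True,
                        help="Directory containing hero/ and augmented_data_store/.")
    args = parser.parse_args()
    out_path = process_manifesto_data_with_metadata(args.input_base_dir)
    print(f"Wrote {out_path}")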