File size: 2,317 Bytes
35b3f62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json
import os
import argparse
from system.html2lines import html2metadata
from lxml.etree import tostring
import lxml.etree

def process_manifesto_data_with_metadata(input_base_dir: str) -> str:
    """Attach scraped page text and title/date metadata to reranked QA evidence.

    Reads the reranked top-k QA file (one JSON object per line), joins each
    QA's URL with the per-claim scraped text stored in
    ``augmented_data_store/<claim_index>.jsonl``, fetches page metadata via
    ``html2metadata``, and writes one enriched JSON line per claim to
    ``all_info_with_txt.json``.

    Args:
        input_base_dir: Directory containing
            ``hero/manifesto_icl_reranking_top_k_QA.json`` and the
            ``augmented_data_store`` folder; the output file is written here.

    Returns:
        Path of the written output file.
    """
    input_file_path = os.path.join(input_base_dir, "hero/manifesto_icl_reranking_top_k_QA.json")
    output_file_path = os.path.join(input_base_dir, "all_info_with_txt.json")

    url2text_dir = os.path.join(input_base_dir, "augmented_data_store")

    with open(input_file_path, "r", encoding="utf-8") as f:
        input_lines = f.readlines()

    # Context manager guarantees the output file is closed even on error
    # (the original left an unclosed handle on any exception).
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        for idx, raw_line in enumerate(input_lines):
            record = json.loads(raw_line)
            claim = record["claim"]
            QAs = record["top_50"]
            new_line = {"claim": claim, "evidence": []}

            json_path = os.path.join(url2text_dir, f"{idx}.jsonl")
            if not os.path.exists(json_path):
                print(f"Warning: {json_path} not found")
                continue

            with open(json_path, "r", encoding="utf-8") as f:
                try:
                    # Some stores are a single JSON array/object...
                    data_store = json.load(f)
                except json.JSONDecodeError:
                    # ...others are JSONL: one object per line.
                    f.seek(0)
                    data_store = [json.loads(entry) for entry in f]

            url_txt = {data["url"]: data["url2text"] for data in data_store}

            # Cache metadata per URL so each distinct URL is fetched exactly
            # once. The original's dedup list was never populated, so it
            # re-fetched metadata for every duplicate QA URL; effective
            # output (all QAs appended, each carrying metadata) is unchanged.
            meta_cache: dict = {}
            for QA in QAs:
                newQA = QA.copy()
                URL = QA["url"]
                newQA["text"] = url_txt.get(URL, "")

                if URL not in meta_cache:
                    try:
                        meta = html2metadata(URL)
                        if isinstance(meta, lxml.etree._Element):
                            meta = tostring(meta, encoding="unicode", pretty_print=True)
                        # NOTE(review): assumes html2metadata returns a
                        # mapping with "title"/"date" keys; if it returned an
                        # Element, the stringified form cannot be indexed and
                        # this falls through to the empty-metadata fallback.
                        meta_cache[URL] = {
                            "title": meta["title"],
                            "date": meta["date"]
                        }
                    except Exception as e:
                        # Best-effort: a failed fetch must not abort the run.
                        print(f"Metadata extraction failed for URL: {URL}, error: {e}")
                        meta_cache[URL] = {
                            "title": "",
                            "date": ""
                        }

                newQA["metadata"] = meta_cache[URL]
                new_line["evidence"].append(newQA)

            out_file.write(json.dumps(new_line) + "\n")

    return output_file_path