"""Scrape the URLs returned for a claim's search queries.

For each row of a tab-separated results file, fetch the URL (HTML or PDF),
extract and line-correct its text, and write the result as one JSON object
per line to a JSONL file.
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from pathlib import Path
from time import sleep

import fitz  # PyMuPDF, used to extract text from PDFs
import pandas as pd
import requests

from system.html2lines import url2lines, line_correction, html2metadata

MAX_RETRIES = 3
TIMEOUT = 5  # seconds


def scrape_text_from_url(url, temp_name):
    """Fetch a URL (HTML or PDF) and return its text as a list of corrected lines."""
    # Retry transient network failures before giving up.
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)

    # Every attempt failed, or the server is unavailable (503): return no lines.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # Save the PDF to a temporary file and extract its text with PyMuPDF.
        pdf_dir = Path("/tmp/pdf_dir")
        pdf_dir.mkdir(parents=True, exist_ok=True)
        pdf_path = pdf_dir / f"{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        extracted_text = ""
        with fitz.open(str(pdf_path)) as doc:  # ensure the document is closed
            for page in doc:
                extracted_text += page.get_text() or ""

        return line_correction(extracted_text.split("\n"))

    # Plain web page: delegate extraction to html2lines.
    return line_correction(url2lines(url))

def process_row(row, claim_id):
    """Scrape one search-result row (type, url, query) into a JSON-serializable dict."""
    try:
        url = row[2]
        json_data = {
            "claim_id": claim_id,
            "type": row[1],
            "query": row[3],
            "url": url,
            # Include the row index in the temp file name so concurrent PDF
            # downloads for the same claim do not overwrite each other.
            "url2text": scrape_text_from_url(url, f"{claim_id}_{row.name}"),
        }
        meta = html2metadata(url)
        json_data["metadata"] = {
            "title": meta.get("title"),
            "date": meta.get("date"),
        }
        return json_data
    except Exception as e:
        print(f"[WARN] Failed to scrape {row[2]}: {e}")
        return None

def run_scraper(tsv_file_path: str, output_jsonl_path: str, max_workers: int = 10):
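    """Scrape every URL listed in the claim's TSV file and write the results to a JSONL file."""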
    claim_id = Path(tsv_file_path).stem
    output_jsonl_path = Path(output_jsonl_path)
    output_jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    if output_jsonl_path.exists():
        print(f"[INFO] Skipping processing as output file already exists: {output_jsonl_path}")
        return str(output_jsonl_path)

    try:
        df = pd.read_csv(tsv_file_path, sep="\t", header=None)
        print("[INFO] Data loaded successfully with Pandas.")
    except Exception as e:
        raise RuntimeError(f"[ERROR] Failed to load TSV: {e}")

    results = []
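    # Scrape rows in parallel; results arrive in completion order, not input order.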
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_row, row, claim_id) for _, row in df.iterrows()]
        for future in as_completed(futures):
            result = future.result()
            if result:
                results.append(result)

    with open(output_jsonl_path, "w", encoding="utf-8") as json_file:
        for item in results:
            json_file.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"[SYSTEM] Output saved to {output_jsonl_path}")
    return str(output_jsonl_path)
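
if __name__ == "__main__":
    # Illustrative invocation only: the paths below are hypothetical. Point
    # them at an actual search-results TSV and the desired output location.
    run_scraper("data/search_results/0.tsv", "data/scraped/0.jsonl", max_workers=10)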