Commit
·
da05e38
1
Parent(s):
df8c52d
updated the fetching method
Browse files
web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
|
|
web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ
|
|
web2json/preprocessor.py
CHANGED
@@ -38,12 +38,19 @@ class Preprocessor(ABC):
|
|
38 |
# Set a User-Agent header to mimic a browser, which can help avoid
|
39 |
# being blocked by some websites.
|
40 |
# Inside _fetch_content method
|
41 |
-
headers = {
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
}
|
48 |
|
49 |
# Make the HTTP GET request with a timeout.
|
|
|
38 |
# Set a User-Agent header to mimic a browser, which can help avoid
|
39 |
# being blocked by some websites.
|
40 |
# Inside _fetch_content method
|
41 |
+
headers = headers = {
|
42 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
|
43 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
44 |
+
"Accept-Language": "en-US,en;q=0.6",
|
45 |
+
"Cache-Control": "max-age=0",
|
46 |
+
"Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
|
47 |
+
"Sec-Ch-Ua-Mobile": "?0",
|
48 |
+
"Sec-Ch-Ua-Platform": "\"Windows\"",
|
49 |
+
"Sec-Fetch-Dest": "document",
|
50 |
+
"Sec-Fetch-Mode": "navigate",
|
51 |
+
"Sec-Fetch-Site": "none",
|
52 |
+
"Sec-Fetch-User": "?1",
|
53 |
+
"Upgrade-Insecure-Requests": "1",
|
54 |
}
|
55 |
|
56 |
# Make the HTTP GET request with a timeout.
|