abdo-Mansour committed on
Commit
da05e38
·
1 Parent(s): df8c52d

updated the fetching method

Browse files
web2json/__pycache__/ai_extractor.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ
 
web2json/__pycache__/pipeline.cpython-311.pyc CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ
 
web2json/preprocessor.py CHANGED
@@ -38,12 +38,19 @@ class Preprocessor(ABC):
38
  # Set a User-Agent header to mimic a browser, which can help avoid
39
  # being blocked by some websites.
40
  # Inside _fetch_content method
41
- headers = {
42
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
43
- 'Accept-Language': 'en-US,en;q=0.9',
44
- 'Accept-Encoding': 'gzip, deflate, br',
45
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
46
- 'Connection': 'keep-alive',
 
 
 
 
 
 
 
47
  }
48
 
49
  # Make the HTTP GET request with a timeout.
 
38
  # Set a User-Agent header to mimic a browser, which can help avoid
39
  # being blocked by some websites.
40
  # Inside _fetch_content method
41
+ headers = headers = {
42
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
43
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
44
+ "Accept-Language": "en-US,en;q=0.6",
45
+ "Cache-Control": "max-age=0",
46
+ "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
47
+ "Sec-Ch-Ua-Mobile": "?0",
48
+ "Sec-Ch-Ua-Platform": "\"Windows\"",
49
+ "Sec-Fetch-Dest": "document",
50
+ "Sec-Fetch-Mode": "navigate",
51
+ "Sec-Fetch-Site": "none",
52
+ "Sec-Fetch-User": "?1",
53
+ "Upgrade-Insecure-Requests": "1",
54
  }
55
 
56
  # Make the HTTP GET request with a timeout.