Update extract.py
Browse files- extract.py +38 -17
extract.py
CHANGED
@@ -8,6 +8,7 @@ import time
|
|
8 |
from selenium.webdriver.common.by import By
|
9 |
from selenium.webdriver.support.ui import WebDriverWait
|
10 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
11 |
|
12 |
def take_webdata(url):
|
13 |
options = webdriver.ChromeOptions()
|
@@ -33,9 +34,40 @@ def take_webdata(url):
|
|
33 |
return Image.open(BytesIO(screenshot)) , page_title
|
34 |
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
def get_vehicle_info(plate_number: str):
|
37 |
# Configure headless Chrome
|
38 |
-
# options = Options()
|
39 |
options = webdriver.ChromeOptions()
|
40 |
options.add_argument("--headless")
|
41 |
options.add_argument("--disable-gpu")
|
@@ -44,23 +76,18 @@ def get_vehicle_info(plate_number: str):
|
|
44 |
# Path to chromedriver (adjust if needed)
|
45 |
driver = webdriver.Chrome(options=options)
|
46 |
|
47 |
-
try:
|
48 |
-
|
49 |
driver.get("https://www.jambisamsat.net/infopkb.html")
|
50 |
time.sleep(1)
|
51 |
|
52 |
-
|
53 |
-
# Wait until input box is present
|
54 |
WebDriverWait(driver, 10).until(
|
55 |
EC.presence_of_element_located((By.ID, "no_polisi"))
|
56 |
)
|
57 |
|
58 |
-
# Fill in the plate number
|
59 |
input_field = driver.find_element(By.ID, "no_polisi")
|
60 |
input_field.clear()
|
61 |
input_field.send_keys(plate_number)
|
62 |
|
63 |
-
# Click the submit button by class name
|
64 |
submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
|
65 |
submit_button.click()
|
66 |
|
@@ -69,22 +96,16 @@ def get_vehicle_info(plate_number: str):
|
|
69 |
EC.url_contains("infopkb.php")
|
70 |
)
|
71 |
|
72 |
-
# # Step 2: Find the input and enter plate number
|
73 |
-
# input_element = driver.find_element(By.NAME, "nopol")
|
74 |
-
# input_element.send_keys(plate_number)
|
75 |
-
|
76 |
-
# # Step 3: Submit the form
|
77 |
-
# submit_button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]')
|
78 |
-
# submit_button.click()
|
79 |
-
# time.sleep(2)
|
80 |
-
|
81 |
driver.implicitly_wait(3)
|
82 |
|
83 |
-
|
84 |
scroll_height = driver.execute_script("return document.body.scrollHeight")
|
85 |
driver.set_window_size(1920, scroll_height + 200) # force full-page height
|
86 |
time.sleep(1)
|
87 |
|
|
|
|
|
|
|
|
|
88 |
page_title = driver.title
|
89 |
screenshot = driver.get_screenshot_as_png()
|
90 |
|
|
|
8 |
from selenium.webdriver.common.by import By
|
9 |
from selenium.webdriver.support.ui import WebDriverWait
|
10 |
from selenium.webdriver.support import expected_conditions as EC
|
11 |
+
from bs4 import BeautifulSoup
|
12 |
|
13 |
def take_webdata(url):
|
14 |
options = webdriver.ChromeOptions()
|
|
|
34 |
return Image.open(BytesIO(screenshot)) , page_title
|
35 |
|
36 |
|
37 |
+
def scrape_vehicle(page_source):
    """Parse vehicle data and the payment breakdown out of a result page.

    Args:
        page_source: HTML markup of the page to parse.

    Returns:
        A ``(data_kendaraan, rincians)`` tuple.

        ``data_kendaraan`` is a dict built from the first ``<table>`` in the
        page: for every row with at least three cells, the text of the first
        cell (lower-cased, dots removed, spaces replaced by underscores) is
        the key and the text of the third cell is the value. It is empty
        when the page contains no table.

        ``rincians`` is a list of dicts built from the ``div.row`` elements
        found inside ``div#det_pkb`` (the first one is skipped as a header).
        Each dict may hold the keys ``pokok``, ``denda``, ``total`` and
        ``jenis``; keys whose text is empty are left out, and rows that end
        up with no keys at all are dropped.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    data_kendaraan = {}
    table = soup.find("table")
    # find() returns None when there is no <table>; fall back to an empty
    # row list instead of raising AttributeError on None.find_all().
    table_rows = table.find_all("tr") if table is not None else []
    for row in table_rows:
        cells = row.find_all("td")
        if len(cells) >= 3:
            # Cell 0 is used as the label and cell 2 as the value; cell 1 is
            # ignored (presumably a separator column -- confirm against the
            # page markup).
            key = cells[0].get_text(strip=True).lower().replace(".", "").replace(" ", "_")
            value = cells[2].get_text(strip=True)
            data_kendaraan[key] = value

    rincians = []
    rincian_div = soup.find("div", id="det_pkb")
    if rincian_div:
        rows = rincian_div.find_all("div", class_="row")
        for row in rows[1:]:  # the first row is the header
            cols = row.find_all("p")
            if len(cols) >= 3:
                rincian = {
                    "pokok": cols[0].get_text(strip=True),
                    "denda": cols[1].get_text(strip=True),
                    "total": cols[2].get_text(strip=True),
                }
                # The fourth column is optional; its text is upper-cased.
                rincian["jenis"] = cols[3].get_text(strip=True) if len(cols) > 3 else ""
                rincian["jenis"] = rincian["jenis"].upper()
                # Drop keys whose text is empty.
                rincian = {k: v for k, v in rincian.items() if v}
                if rincian:
                    rincians.append(rincian)

    return data_kendaraan, rincians
|
67 |
+
|
68 |
+
|
69 |
def get_vehicle_info(plate_number: str):
|
70 |
# Configure headless Chrome
|
|
|
71 |
options = webdriver.ChromeOptions()
|
72 |
options.add_argument("--headless")
|
73 |
options.add_argument("--disable-gpu")
|
|
|
76 |
# Path to chromedriver (adjust if needed)
|
77 |
driver = webdriver.Chrome(options=options)
|
78 |
|
79 |
+
try:
|
|
|
80 |
driver.get("https://www.jambisamsat.net/infopkb.html")
|
81 |
time.sleep(1)
|
82 |
|
|
|
|
|
83 |
WebDriverWait(driver, 10).until(
|
84 |
EC.presence_of_element_located((By.ID, "no_polisi"))
|
85 |
)
|
86 |
|
|
|
87 |
input_field = driver.find_element(By.ID, "no_polisi")
|
88 |
input_field.clear()
|
89 |
input_field.send_keys(plate_number)
|
90 |
|
|
|
91 |
submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
|
92 |
submit_button.click()
|
93 |
|
|
|
96 |
EC.url_contains("infopkb.php")
|
97 |
)
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
driver.implicitly_wait(3)
|
100 |
|
|
|
101 |
scroll_height = driver.execute_script("return document.body.scrollHeight")
|
102 |
driver.set_window_size(1920, scroll_height + 200) # force full-page height
|
103 |
time.sleep(1)
|
104 |
|
105 |
+
data_kendaraan, rincian = scrape_vehicle(driver.page_source)
|
106 |
+
|
107 |
+
print(data_kendaraan, rincian)
|
108 |
+
|
109 |
page_title = driver.title
|
110 |
screenshot = driver.get_screenshot_as_png()
|
111 |
|