jonathanjordan21 committed on
Commit
85d5816
·
verified ·
1 Parent(s): ef19ec0

Update extract.py

Browse files
Files changed (1) hide show
  1. extract.py +38 -17
extract.py CHANGED
@@ -8,6 +8,7 @@ import time
8
  from selenium.webdriver.common.by import By
9
  from selenium.webdriver.support.ui import WebDriverWait
10
  from selenium.webdriver.support import expected_conditions as EC
 
11
 
12
  def take_webdata(url):
13
  options = webdriver.ChromeOptions()
@@ -33,9 +34,40 @@ def take_webdata(url):
33
  return Image.open(BytesIO(screenshot)) , page_title
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def get_vehicle_info(plate_number: str):
37
  # Configure headless Chrome
38
- # options = Options()
39
  options = webdriver.ChromeOptions()
40
  options.add_argument("--headless")
41
  options.add_argument("--disable-gpu")
@@ -44,23 +76,18 @@ def get_vehicle_info(plate_number: str):
44
  # Path to chromedriver (adjust if needed)
45
  driver = webdriver.Chrome(options=options)
46
 
47
- try:
48
-
49
  driver.get("https://www.jambisamsat.net/infopkb.html")
50
  time.sleep(1)
51
 
52
-
53
- # Wait until input box is present
54
  WebDriverWait(driver, 10).until(
55
  EC.presence_of_element_located((By.ID, "no_polisi"))
56
  )
57
 
58
- # Fill in the plate number
59
  input_field = driver.find_element(By.ID, "no_polisi")
60
  input_field.clear()
61
  input_field.send_keys(plate_number)
62
 
63
- # Click the submit button by class name
64
  submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
65
  submit_button.click()
66
 
@@ -69,22 +96,16 @@ def get_vehicle_info(plate_number: str):
69
  EC.url_contains("infopkb.php")
70
  )
71
 
72
- # # Step 2: Find the input and enter plate number
73
- # input_element = driver.find_element(By.NAME, "nopol")
74
- # input_element.send_keys(plate_number)
75
-
76
- # # Step 3: Submit the form
77
- # submit_button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]')
78
- # submit_button.click()
79
- # time.sleep(2)
80
-
81
  driver.implicitly_wait(3)
82
 
83
-
84
  scroll_height = driver.execute_script("return document.body.scrollHeight")
85
  driver.set_window_size(1920, scroll_height + 200) # force full-page height
86
  time.sleep(1)
87
 
 
 
 
 
88
  page_title = driver.title
89
  screenshot = driver.get_screenshot_as_png()
90
 
 
8
  from selenium.webdriver.common.by import By
9
  from selenium.webdriver.support.ui import WebDriverWait
10
  from selenium.webdriver.support import expected_conditions as EC
11
+ from bs4 import BeautifulSoup
12
 
13
  def take_webdata(url):
14
  options = webdriver.ChromeOptions()
 
34
  return Image.open(BytesIO(screenshot)) , page_title
35
 
36
 
37
def scrape_vehicle(page_source):
    """Extract vehicle data and payment detail rows from result-page HTML.

    Args:
        page_source: HTML markup of the page to parse.

    Returns:
        A ``(data_kendaraan, rincians)`` tuple.

        ``data_kendaraan`` is a dict built from the first ``<table>`` in the
        document: for every row with at least three ``<td>`` cells, the first
        cell (lower-cased, dots removed, spaces replaced by underscores) is
        the key and the third cell is the value.

        ``rincians`` is a list of dicts read from the ``div.row`` children of
        ``div#det_pkb`` (the first row is skipped as the header). Each dict
        may hold ``pokok``, ``denda``, ``total`` and ``jenis``; entries whose
        text is empty are dropped, and rows with no remaining entries are
        omitted.

        Either part is empty when its element is missing from the page.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    data_kendaraan = {}
    table = soup.find("table")
    # find() returns None when the page contains no <table>; guard against it
    # so a page without results yields empty data instead of AttributeError.
    if table is not None:
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 3:
                continue
            # Only cells 0 and 2 are read; the middle cell is presumably a
            # separator column -- TODO confirm against the live page.
            key = (
                cells[0]
                .get_text(strip=True)
                .lower()
                .replace(".", "")
                .replace(" ", "_")
            )
            data_kendaraan[key] = cells[2].get_text(strip=True)

    rincians = []
    rincian_div = soup.find("div", id="det_pkb")
    if rincian_div is not None:
        rows = rincian_div.find_all("div", class_="row")
        for row in rows[1:]:  # the first row is the header
            cols = row.find_all("p")
            if len(cols) < 3:
                continue
            jenis = cols[3].get_text(strip=True) if len(cols) > 3 else ""
            rincian = {
                "pokok": cols[0].get_text(strip=True),
                "denda": cols[1].get_text(strip=True),
                "total": cols[2].get_text(strip=True),
                "jenis": jenis.upper(),
            }
            # Drop entries whose text is empty; skip rows left with nothing.
            rincian = {k: v for k, v in rincian.items() if v}
            if rincian:
                rincians.append(rincian)

    return data_kendaraan, rincians
67
+
68
+
69
  def get_vehicle_info(plate_number: str):
70
  # Configure headless Chrome
 
71
  options = webdriver.ChromeOptions()
72
  options.add_argument("--headless")
73
  options.add_argument("--disable-gpu")
 
76
  # Path to chromedriver (adjust if needed)
77
  driver = webdriver.Chrome(options=options)
78
 
79
+ try:
 
80
  driver.get("https://www.jambisamsat.net/infopkb.html")
81
  time.sleep(1)
82
 
 
 
83
  WebDriverWait(driver, 10).until(
84
  EC.presence_of_element_located((By.ID, "no_polisi"))
85
  )
86
 
 
87
  input_field = driver.find_element(By.ID, "no_polisi")
88
  input_field.clear()
89
  input_field.send_keys(plate_number)
90
 
 
91
  submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
92
  submit_button.click()
93
 
 
96
  EC.url_contains("infopkb.php")
97
  )
98
 
 
 
 
 
 
 
 
 
 
99
  driver.implicitly_wait(3)
100
 
 
101
  scroll_height = driver.execute_script("return document.body.scrollHeight")
102
  driver.set_window_size(1920, scroll_height + 200) # force full-page height
103
  time.sleep(1)
104
 
105
+ data_kendaraan, rincian = scrape_vehicle(driver.page_source)
106
+
107
+ print(data_kendaraan, rincian)
108
+
109
  page_title = driver.title
110
  screenshot = driver.get_screenshot_as_png()
111