jonathanjordan21 committed on
Commit
be21b28
·
verified ·
1 Parent(s): 85d5816

Update extract.py

Browse files
Files changed (1) hide show
  1. extract.py +25 -24
extract.py CHANGED
@@ -8,7 +8,6 @@ import time
8
  from selenium.webdriver.common.by import By
9
  from selenium.webdriver.support.ui import WebDriverWait
10
  from selenium.webdriver.support import expected_conditions as EC
11
- from bs4 import BeautifulSoup
12
 
13
  def take_webdata(url):
14
  options = webdriver.ChromeOptions()
@@ -34,34 +33,36 @@ def take_webdata(url):
34
  return Image.open(BytesIO(screenshot)) , page_title
35
 
36
 
37
- def scrape_vehicle(page_source):
38
- soup = BeautifulSoup(page_source, "html.parser")
39
  data_kendaraan = {}
40
- table = soup.find("table")
41
- for row in table.find_all("tr"):
42
- cells = row.find_all("td")
43
- if len(cells) >= 3:
44
- key = cells[0].get_text(strip=True).lower().replace(".", "").replace(" ", "_")
45
- value = cells[2].get_text(strip=True)
46
- data_kendaraan[key] = value
 
 
 
47
 
48
  rincians = []
49
- rincian_div = soup.find("div", id="det_pkb")
50
- if rincian_div:
51
- rows = rincian_div.find_all("div", class_="row")
52
- for row in rows[1:]: # baris pertama adalah header
53
- cols = row.find_all("p")
54
  if len(cols) >= 3:
55
  rincian = {
56
- "pokok": cols[0].get_text(strip=True),
57
- "denda": cols[1].get_text(strip=True),
58
- "total": cols[2].get_text(strip=True),
59
  }
60
- rincian["jenis"] = cols[3].get_text(strip=True) if len(cols) > 3 else ""
61
- rincian["jenis"] = rincian["jenis"].upper()
62
- rincian = {k: v for k, v in rincian.items() if v}
63
- if rincian:
64
- rincians.append(rincian)
65
 
66
  return data_kendaraan, rincians
67
 
@@ -102,7 +103,7 @@ def get_vehicle_info(plate_number: str):
102
  driver.set_window_size(1920, scroll_height + 200) # force full-page height
103
  time.sleep(1)
104
 
105
- data_kendaraan, rincian = scrape_vehicle(driver.page_source)
106
 
107
  print(data_kendaraan, rincian)
108
 
 
8
  from selenium.webdriver.common.by import By
9
  from selenium.webdriver.support.ui import WebDriverWait
10
  from selenium.webdriver.support import expected_conditions as EC
 
11
 
12
  def take_webdata(url):
13
  options = webdriver.ChromeOptions()
 
33
  return Image.open(BytesIO(screenshot)) , page_title
34
 
35
 
36
+ def scrape_vehicle(driver):
 
37
  data_kendaraan = {}
38
+ try:
39
+ rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
40
+ for row in rows:
41
+ cols = row.find_elements(By.TAG_NAME, "td")
42
+ if len(cols) >= 3:
43
+ key = cols[0].text.strip().lower().replace(".", "").replace(" ", "_")
44
+ value = cols[2].text.strip()
45
+ data_kendaraan[key] = value
46
+ except Exception as e:
47
+ print("Gagal parsing tabel:", e)
48
 
49
  rincians = []
50
+ try:
51
+ container = driver.find_element(By.ID, "det_pkb")
52
+ rows = container.find_elements(By.CLASS_NAME, "row")
53
+ for row in rows[1:]: # skip header
54
+ cols = row.find_elements(By.TAG_NAME, "p")
55
  if len(cols) >= 3:
56
  rincian = {
57
+ "pokok": cols[0].text.strip(),
58
+ "denda": cols[1].text.strip(),
59
+ "total": cols[2].text.strip(),
60
  }
61
+ if len(cols) > 3:
62
+ rincian["jenis"] = cols[3].text.strip().upper()
63
+ rincians.append(rincian)
64
+ except Exception as e:
65
+ print("Gagal parsing det_pkb:", e)
66
 
67
  return data_kendaraan, rincians
68
 
 
103
  driver.set_window_size(1920, scroll_height + 200) # force full-page height
104
  time.sleep(1)
105
 
106
+ data_kendaraan, rincian = scrape_vehicle(driver)
107
 
108
  print(data_kendaraan, rincian)
109