selenium_web_scrape

Running

App Files Files Community

jonathanjordan21 committed on May 15

Commit

be21b28

verified ·

1 Parent(s): 85d5816

Update extract.py

Browse files

Files changed (1) hide show

extract.py +25 -24

extract.py CHANGED Viewed

@@ -8,7 +8,6 @@ import time
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
-from bs4 import BeautifulSoup
 def take_webdata(url):
     options = webdriver.ChromeOptions()
@@ -34,34 +33,36 @@ def take_webdata(url):
     return Image.open(BytesIO(screenshot)) , page_title
-def scrape_vehicle(page_source):
-    soup = BeautifulSoup(page_source, "html.parser")
     data_kendaraan = {}
-    table = soup.find("table")
-    for row in table.find_all("tr"):
-        cells = row.find_all("td")
-        if len(cells) >= 3:
-            key = cells[0].get_text(strip=True).lower().replace(".", "").replace(" ", "_")
-            value = cells[2].get_text(strip=True)
-            data_kendaraan[key] = value
     rincians = []
-    rincian_div = soup.find("div", id="det_pkb")
-    if rincian_div:
-        rows = rincian_div.find_all("div", class_="row")
-        for row in rows[1:]:  # baris pertama adalah header
-            cols = row.find_all("p")
             if len(cols) >= 3:
                 rincian = {
-                    "pokok": cols[0].get_text(strip=True),
-                    "denda": cols[1].get_text(strip=True),
-                    "total": cols[2].get_text(strip=True),
                 }
-                rincian["jenis"] = cols[3].get_text(strip=True) if len(cols) > 3 else ""
-                rincian["jenis"] = rincian["jenis"].upper()
-                rincian = {k: v for k, v in rincian.items() if v}
-                if rincian:
-                    rincians.append(rincian)
     return data_kendaraan, rincians
@@ -102,7 +103,7 @@ def get_vehicle_info(plate_number: str):
         driver.set_window_size(1920, scroll_height + 200)  # force full-page height
         time.sleep(1)
-        data_kendaraan, rincian = scrape_vehicle(driver.page_source)
         print(data_kendaraan, rincian)

 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 def take_webdata(url):
     options = webdriver.ChromeOptions()
     return Image.open(BytesIO(screenshot)) , page_title
+def scrape_vehicle(driver):
     data_kendaraan = {}
+    try:
+        rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+        for row in rows:
+            cols = row.find_elements(By.TAG_NAME, "td")
+            if len(cols) >= 3:
+                key = cols[0].text.strip().lower().replace(".", "").replace(" ", "_")
+                value = cols[2].text.strip()
+                data_kendaraan[key] = value
+    except Exception as e:
+        print("Gagal parsing tabel:", e)
     rincians = []
+    try:
+        container = driver.find_element(By.ID, "det_pkb")
+        rows = container.find_elements(By.CLASS_NAME, "row")
+        for row in rows[1:]:  # skip header
+            cols = row.find_elements(By.TAG_NAME, "p")
             if len(cols) >= 3:
                 rincian = {
+                    "pokok": cols[0].text.strip(),
+                    "denda": cols[1].text.strip(),
+                    "total": cols[2].text.strip(),
                 }
+                if len(cols) > 3:
+                    rincian["jenis"] = cols[3].text.strip().upper()
+                rincians.append(rincian)
+    except Exception as e:
+        print("Gagal parsing det_pkb:", e)
     return data_kendaraan, rincians
         driver.set_window_size(1920, scroll_height + 200)  # force full-page height
         time.sleep(1)
+        data_kendaraan, rincian = scrape_vehicle(driver)
         print(data_kendaraan, rincian)