Spaces:

Sameercodes
/

Amazon_web_scraper

Sleeping

File size: 7,487 Bytes

887d6e4
02d6bfc

# Setup (run in a shell, not inside Python): pip install selenium
import random
import re
import time
from datetime import datetime
from urllib.parse import quote_plus

import gradio as gr
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Site the search URL targets; relative product links are resolved against it.
_AMAZON_BASE_URL = "https://www.amazon.in"

# Quantity-with-unit pattern (e.g. "500 g", "1.5l") searched in the lowercased title.
_GRAMMAGE_PATTERN = re.compile(r'(\d+\.?\d*\s?(ml|g|kg|l))')

# Anything outside word characters, hyphens and spaces is replaced in file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^\w\- ]+')

# XPath 1.0 has no lower-case(); translate() is the usual case-folding workaround.
_LOWERCASE_TEXT = (
    "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
)
_DEAL_KEYWORDS = ('deal', 'coupon', 'save', 'limited')
_DEAL_BADGE_XPATH = (
    ".//div[contains(@class, 'a-color-secondary')]//span["
    + " or ".join(
        f"contains({_LOWERCASE_TEXT}, '{keyword}')" for keyword in _DEAL_KEYWORDS
    )
    + "]"
)


def _first_text(parent, xpath, include_hidden=False):
    """Return the stripped text of the first XPath match under *parent*, or None.

    With ``include_hidden=True`` the element's ``textContent`` is used when the
    rendered text is empty, which is needed for visually hidden spans.
    """
    try:
        element = parent.find_element(By.XPATH, xpath)
        text = element.text
        if not text and include_hidden:
            text = element.get_attribute("textContent")
    except (NoSuchElementException, StaleElementReferenceException):
        return None
    text = (text or "").strip()
    return text or None


def _first_attribute(parent, xpath, attribute):
    """Return *attribute* of the first XPath match under *parent*, or None."""
    try:
        return parent.find_element(By.XPATH, xpath).get_attribute(attribute)
    except (NoSuchElementException, StaleElementReferenceException):
        return None


def _clean_price(raw_text):
    """Strip the rupee sign and thousands separators; return None if nothing is left."""
    if not raw_text:
        return None
    cleaned = raw_text.replace('₹', '').replace(',', '').strip()
    return cleaned or None


def _discount_percent(mrp, selling_price):
    """Return the discount of *selling_price* against *mrp* in percent (0.0 if unknown)."""
    try:
        mrp_value = float(mrp)
        price_value = float(selling_price)
    except (TypeError, ValueError):
        # Placeholders such as "No Price" or malformed numbers.
        return 0.0
    if mrp_value == 0:
        return 0.0
    return round(100 * (mrp_value - price_value) / mrp_value, 2)


def _extract_product(product, title):
    """Collect the per-product fields from one search-result element."""
    # Link: resolve relative hrefs against the site that was actually scraped.
    link = _first_attribute(
        product, ".//a[@class='a-link-normal s-no-outline']", 'href'
    )
    if link and link.startswith("/"):
        link = _AMAZON_BASE_URL + link
    link = link or "No Link"

    # Selling price: visible whole-rupee part first, hidden full price as fallback.
    selling_price = (
        _clean_price(_first_text(product, ".//span[@class='a-price-whole']"))
        or _clean_price(
            _first_text(
                product, ".//span[@class='a-offscreen']", include_hidden=True
            )
        )
        or "No Price"
    )

    # MRP (struck-through list price); defaults to the selling price when absent.
    mrp = _clean_price(
        _first_text(
            product,
            ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']",
            include_hidden=True,
        )
    ) or selling_price

    grammage_match = _GRAMMAGE_PATTERN.search(title.lower())
    grammage = grammage_match.group(0) if grammage_match else "No Grammage"

    # Rating is the first token of an aria-label like "4.2 out of 5 stars".
    rating_label = _first_attribute(
        product, ".//span[contains(@aria-label,'out of 5 stars')]", "aria-label"
    )
    rating_tokens = (rating_label or "").split()
    rating = rating_tokens[0] if rating_tokens else "No Rating"

    sponsored_xpath = (
        ".//span[contains(@class, 'puis-sponsored-label-text') "
        "and contains(text(), 'Sponsored')]"
    )
    is_ad = _first_attribute(product, sponsored_xpath, "class") is not None

    return {
        'Title': title,
        'Grammage': grammage,
        'Selling Price': selling_price,
        'MRP': mrp,
        'Discount %': _discount_percent(mrp, selling_price),
        'Deal Tags': _first_text(product, _DEAL_BADGE_XPATH) or "No Deal",
        'Quantity Bought': _first_text(
            product, ".//span[contains(text(),'bought in past month')]"
        ) or "No data",
        'Rating': rating,
        'Reviews': _first_text(
            product, ".//a[contains(@aria-label,'ratings')]/span"
        ) or "No Reviews",
        'Link': link,
        'Ad/Not Ad': "Ad" if is_ad else "Not Ad",
    }


def scrape_amazon(search_term, pincode, num_pages=5):
    """Scrape amazon.in search results and export them to Excel, CSV and JSON.

    Args:
        search_term: Query typed into the Amazon search; also stored in the
            'Search Term' and 'Category' columns and used in the file names.
        pincode: Stored verbatim in the 'Pincode' column. It is NOT applied
            to the browser session, so results are not location-specific.
        num_pages: Number of result pages to visit, starting from page 1.

    Returns:
        Tuple ``(excel_path, csv_path, json_path)`` of the files written to the
        current working directory.

    Products are de-duplicated by title across pages. Fields that cannot be
    found on a result card get a placeholder such as "No Price" or "No Deal".
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # URL-encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
    query = quote_plus(str(search_term))
    scrape_date = datetime.now().strftime("%d-%m-%Y")

    all_products = []
    seen_titles = set()

    driver = webdriver.Chrome(service=Service(), options=options)
    try:
        for page in range(1, num_pages + 1):
            url = (
                f"https://www.amazon.in/s?k={query}&page={page}"
                f"&crid=2M096C61O4MLT&sprefix={query},aps,283"
            )
            driver.get(url)

            time.sleep(random.uniform(3, 5))  # Let page load

            # Scroll down to load dynamic content
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(2, 4))

            products = driver.find_elements(
                By.XPATH, "//div[@data-component-type='s-search-result']"
            )
            print(f"Scraping page {page}, found {len(products)} products...")

            for product in products:
                title = _first_text(product, ".//h2//span")
                if title is None:
                    # Untitled cards cannot be de-duplicated by title; keep
                    # each one instead of dropping all but the first.
                    title = "No Title"
                elif title in seen_titles:
                    continue
                else:
                    seen_titles.add(title)

                product_data = _extract_product(product, title)
                product_data.update({
                    'Date': scrape_date,
                    'Search Term': search_term,
                    'Pincode': pincode,
                    'Category': search_term,
                })
                all_products.append(product_data)

            time.sleep(random.uniform(2, 4))  # Pause between pages
    finally:
        # Always shut the browser down, even if a page load or lookup fails.
        driver.quit()

    # Create DataFrame
    df = pd.DataFrame(all_products)

    # Save outputs. The search term is user input, so neutralise path
    # separators and other unsafe characters before using it in a file name.
    today_date = datetime.now().strftime("%Y-%m-%d")
    safe_term = _UNSAFE_FILENAME_CHARS.sub("_", str(search_term)).strip() or "search"
    filename_base = f"{safe_term}scrape{today_date}"

    excel_path = f"{filename_base}.xlsx"
    csv_path = f"{filename_base}.csv"
    json_path = f"{filename_base}.json"

    df.to_excel(excel_path, index=False)
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient="records", lines=True)

    return excel_path, csv_path, json_path


def scrape_and_return_files(product_name, pincode, num_pages):
    """Validate the form inputs, run the scraper and return its output files.

    Args:
        product_name: Search term entered in the UI; surrounding whitespace
            is stripped before use.
        pincode: Passed through to ``scrape_amazon`` unchanged.
        num_pages: Page count from the UI; may arrive as a float or as
            ``None`` when the field is empty.

    Returns:
        Tuple ``(excel_path, csv_path, json_path)`` as produced by
        ``scrape_amazon``.

    Raises:
        ValueError: If the product name is blank, or the page count is
            missing or smaller than 1.
    """
    search_term = (product_name or "").strip()
    if not search_term:
        raise ValueError("Product name must not be empty.")

    if num_pages is None:
        raise ValueError("Number of pages is required.")
    page_count = int(num_pages)
    if page_count < 1:
        raise ValueError("Number of pages must be at least 1.")

    return scrape_amazon(search_term, pincode, page_count)


# Gradio UI: three inputs in one row, a trigger button, and one download
# slot per exported file format.
with gr.Blocks() as demo:
    gr.Markdown("## 🛒 Amazon Scraper")

    with gr.Row():
        product_name = gr.Textbox(label="Product Name", placeholder="e.g., atta")
        pincode = gr.Textbox(label="Pincode", placeholder="e.g., 400076")
        num_pages = gr.Number(label="Number of Pages", value=2)

    scrape_button = gr.Button("Scrape Amazon!")

    output_excel = gr.File(label="Download Excel (.xlsx)")
    output_csv = gr.File(label="Download CSV (.csv)")
    output_json = gr.File(label="Download JSON (.json)")

    # The outputs list order must match the tuple order returned by
    # scrape_and_return_files: (excel, csv, json).
    scrape_button.click(
        scrape_and_return_files,
        inputs=[product_name, pincode, num_pages],
        outputs=[output_excel, output_csv, output_json],
    )

# Start the server only when run as a script, so importing this module
# (e.g. from tests or another app) does not launch a web server.
if __name__ == "__main__":
    demo.launch(share=True)