import random
import re
import time
from datetime import datetime
from urllib.parse import quote_plus

import gradio as gr
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


def scrape_amazon(search_term, pincode, num_pages=5):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(), options=options)

    all_products = []
    seen_titles = set()

    for page in range(1, num_pages + 1):
        # URL-encode the search term; session-specific params (crid/sprefix) are omitted.
        url = f"https://www.amazon.in/s?k={quote_plus(search_term)}&page={page}"
        driver.get(url)
        time.sleep(random.uniform(3, 5))  # Let page load

        # Scroll down to load dynamic content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2, 4))

        products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
        print(f"Scraping page {page}, found {len(products)} products...")

        for product in products:
            try:
                title_elem = product.find_element(By.XPATH, ".//h2//span")
                title = title_elem.text.strip()
            except Exception:
                title = "No Title"

            # Skip listings whose title has already been seen (Amazon repeats
            # some results across pages)
            if title in seen_titles:
                continue
            seen_titles.add(title)

            try:
                link_elem = product.find_element(By.XPATH, ".//a[@class='a-link-normal s-no-outline']")
                link = link_elem.get_attribute('href')
            except Exception:
                link = "No Link"

            # Selling price: visible whole-number span first, offscreen span as fallback
            try:
                price_elem = product.find_element(By.XPATH, ".//span[@class='a-price-whole']")
                selling_price = price_elem.text.replace(',', '').strip()
            except Exception:
                try:
                    price_elem = product.find_element(By.XPATH, ".//span[@class='a-offscreen']")
                    selling_price = price_elem.text.replace('₹', '').replace(',', '').strip()
                except Exception:
                    selling_price = "No Price"

            # MRP is the struck-through list price
            try:
                mrp_elem = product.find_element(
                    By.XPATH,
                    ".//span[@class='a-price a-text-price' and @data-a-strike='true']"
                    "//span[@class='a-offscreen']",
                )
                raw_price = mrp_elem.get_attribute("textContent")
                mrp = raw_price.replace('₹', '').replace(',', '').strip()
            except Exception:
                mrp = "No Price"

            try:
                if selling_price != "No Price" and mrp != "No Price":
                    discount_percent = round(100 * (float(mrp) - float(selling_price)) / float(mrp), 2)
                else:
                    discount_percent = 0.0
            except Exception:
                discount_percent = 0.0

            # Pack size (e.g. "500 g", "1.5 l") parsed out of the title
            try:
                grammage_match = re.search(r'(\d+\.?\d*\s?(ml|g|kg|l))', title.lower())
                grammage = grammage_match.group(0) if grammage_match else "No Grammage"
            except Exception:
                grammage = "No Grammage"

            # Deal/coupon badges; translate() makes the text match case-insensitive
            try:
                lower = "translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')"
                badge = product.find_element(
                    By.XPATH,
                    f".//div[contains(@class, 'a-color-secondary')]//span["
                    f"contains({lower}, 'deal') or contains({lower}, 'coupon') or "
                    f"contains({lower}, 'save') or contains({lower}, 'limited')]",
                )
                deal_tag = badge.text.strip()
            except Exception:
                deal_tag = "No Deal"

            try:
                qty = product.find_element(By.XPATH, ".//span[contains(text(),'bought in past month')]").text.strip()
            except Exception:
                qty = "No data"

            try:
                rating_elem = product.find_element(By.XPATH, ".//span[@class='a-icon-alt']")
                rating = rating_elem.get_attribute("textContent").split()[0]
            except Exception:
                rating = "No Rating"

            try:
                reviews = product.find_element(By.XPATH, ".//a[contains(@aria-label,'ratings')]/span").text.strip()
            except Exception:
                reviews = "No Reviews"

            # Sponsored listings carry a "Sponsored" label
            try:
                product.find_element(
                    By.XPATH,
                    ".//span[contains(@class, 'a-color-secondary') and contains(text(), 'Sponsored')]",
                )
                ad_status = "Ad"
            except Exception:
                ad_status = "Not Ad"

            product_data = {
                'Title': title,
                'Grammage': grammage,
                'Selling Price': selling_price,
                'MRP': mrp,
                'Discount %': discount_percent,
                'Deal Tags': deal_tag,
                'Quantity Bought': qty,
                'Rating': rating,
                'Reviews': reviews,
                'Link': link,
                'Ad/Not Ad': ad_status,
                'Date': datetime.now().strftime("%d-%m-%Y"),
                'Search Term': search_term,
                # Note: the pincode is recorded in the output but not applied to
                # the browser session, so results reflect Amazon's default location.
                'Pincode': pincode,
                'Category': search_term,
            }
            all_products.append(product_data)

        time.sleep(random.uniform(2, 4))  # Pause between pages

    driver.quit()

    df = pd.DataFrame(all_products)
    today_date = datetime.now().strftime("%Y-%m-%d")
    # Sanitize the search term so it is safe to use in a filename
    safe_term = re.sub(r'[^\w\-]+', '_', search_term).strip('_')
    filename_base = f"{safe_term}_scrape_{today_date}"
    excel_path = f"{filename_base}.xlsx"
    csv_path = f"{filename_base}.csv"
    json_path = f"{filename_base}.json"
    df.to_excel(excel_path, index=False)
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient="records", lines=True)

    return excel_path, csv_path, json_path, df


### Now the Gradio interface
def gradio_interface(search_term, pincode, num_pages):
    excel_path, csv_path, json_path, df = scrape_amazon(search_term, pincode, int(num_pages))
    return df, excel_path, csv_path, json_path


# Gradio App
app = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Search Term"),
        gr.Textbox(label="Pincode"),
        gr.Slider(minimum=1, maximum=10, step=1, value=2, label="Number of Pages to Scrape"),
    ],
    outputs=[
        gr.Dataframe(label="Scraped Data"),
        gr.File(label="Excel File"),
        gr.File(label="CSV File"),
        gr.File(label="JSON File"),
    ],
    title="🛒 Amazon.in Product Scraper",
    description="Enter a search term, pincode, and number of pages. Download the results as Excel/CSV/JSON.",
)

if __name__ == "__main__":
    app.launch(share=True)
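
# A minimal sketch of how to run this script. Assumptions: Google Chrome is
# installed (Selenium 4.6+ ships Selenium Manager, which fetches a matching
# chromedriver automatically), and the file is saved as amazon_scraper.py
# (the filename is illustrative):
#
#   pip install selenium pandas gradio openpyxl   # openpyxl backs df.to_excel
#   python amazon_scraper.py
#
# app.launch(share=True) serves the UI locally and also prints a temporary
# public *.gradio.live URL; drop share=True to keep the app local only.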