Spaces:

Sameercodes
/

Amazon_web_scraper

Sleeping

App Files Files Community

Amazon_web_scraper / app.py

Sameercodes

Update app.py

887d6e4 verified 4 months ago

raw

history blame

7.49 kB

# Setup (run in a shell, not in Python): pip install selenium
import random
import re
import time
from datetime import datetime
from urllib.parse import urlencode

import gradio as gr
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

	def scrape_amazon(search_term, pincode, num_pages=5):
	options = Options()
	options.add_argument('--headless')
	options.add_argument('--disable-blink-features=AutomationControlled')
	options.add_argument('--disable-gpu')
	options.add_argument('--no-sandbox')

	driver = webdriver.Chrome(service=Service(), options=options)

	all_products = []
	seen_titles = set()

	for page in range(1, num_pages + 1):
	url = f"https://www.amazon.in/s?k={search_term}&page={page}&crid=2M096C61O4MLT&sprefix={search_term},aps,283"
	driver.get(url)

	time.sleep(random.uniform(3, 5)) # Let page load

	# Scroll down to load dynamic content
	driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	time.sleep(random.uniform(2, 4))

	products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
	print(f"Scraping page {page}, found {len(products)} products...")

	for product in products:

	try:
	title_elem = product.find_element(By.XPATH, ".//h2//span")
	title = title_elem.text.strip()
	except:
	title = "No Title"

	if title in seen_titles:
	continue
	seen_titles.add(title)

	# Link Extraction
	try:
	link_elem = product.find_element(By.XPATH, ".//a[@class='a-link-normal s-no-outline']")
	link = link_elem.get_attribute('href')
	if link and link.startswith("/"):
	link = "https://www.amazon.com" + link
	except:
	link = "No Link"

	# Selling Price Extraction
	try:
	price_elem = product.find_element(By.XPATH, ".//span[@class='a-price-whole']")
	selling_price = (price_elem.text).replace(',', '').strip()
	except:
	try:
	price_elem = product.find_element(By.XPATH, ".//span[@class='a-offscreen']")
	selling_price = price_elem.text.replace('₹', '').replace(',', '').strip()
	except:
	selling_price = "No Price"

	try:
	mrp_elem = product.find_element(By.XPATH, ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']")
	mrp = mrp_elem.text.replace('₹', '').replace(',', '').strip()

	except:
	mrp = selling_price

	# Discount Extraction
	try:
	if selling_price != "No Price" and mrp != "No Price":
	discount_percent = round(100 * (float(mrp) - float(selling_price)) / float(mrp), 2)
	else:
	discount_percent = 0.0
	except:
	discount_percent = 0.0

	# Grammage Extraction
	try:
	grammage_match = re.search(r'(\d+\.?\d*\s?(ml\|g\|kg\|l))', title.lower())
	grammage = grammage_match.group(0) if grammage_match else "No Grammage"
	except:
	grammage = "No Grammage"

	# Deal Tags Extraction
	try:
	badge = product.find_element(By.XPATH, ".//div[contains(@class, 'a-color-secondary')]//span[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'deal') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'coupon') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'save') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'limited')]")
	deal_tag = badge.text.strip()
	except:
	deal_tag = "No Deal"

	# Quantity Bought Extraction
	try:
	qty = product.find_element(By.XPATH, ".//span[contains(text(),'bought in past month')]").text.strip()
	except:
	qty = "No data"

	# Rating Extraction
	try:
	rating_elem = product.find_element(By.XPATH, ".//span[contains(@aria-label,'out of 5 stars')]")
	rating = rating_elem.get_attribute("aria-label").split()[0]
	except:
	rating = "No Rating"

	# Reviews Extraction
	try:
	reviews = product.find_element(By.XPATH, ".//a[contains(@aria-label,'ratings')]/span").text.strip()
	except:
	reviews = "No Reviews"

	# Ad / Not Ad Detection
	try:
	ad_elem = product.find_element(By.XPATH, ".//span[contains(@class, 'puis-sponsored-label-text') and contains(text(), 'Sponsored')]")
	ad_status = "Ad"
	except:
	ad_status = "Not Ad"

	# Compile product info
	product_data = {
	'Title': title,
	'Grammage': grammage,
	'Selling Price': selling_price,
	'MRP': mrp,
	'Discount %': discount_percent,
	'Deal Tags': deal_tag,
	'Quantity Bought': qty,
	'Rating': rating,
	'Reviews': reviews,
	'Link': link,
	'Ad/Not Ad': ad_status,
	'Date': datetime.now().strftime("%d-%m-%Y"),
	'Search Term': search_term,
	'Pincode': pincode,
	'Category': search_term,
	}

	all_products.append(product_data)

	time.sleep(random.uniform(2, 4)) # Pause between pages

	driver.quit()

	# Create DataFrame
	df = pd.DataFrame(all_products)

	# Save outputs
	today_date = datetime.now().strftime("%Y-%m-%d")
	filename_base = f"{search_term}scrape{today_date}"

	excel_path = f"{filename_base}.xlsx"
	csv_path = f"{filename_base}.csv"
	json_path = f"{filename_base}.json"

	df.to_excel(excel_path, index=False)
	df.to_csv(csv_path, index=False)
	df.to_json(json_path, orient="records", lines=True)

	return excel_path, csv_path, json_path


	def scrape_and_return_files(product_name, pincode, num_pages):
	excel_path, csv_path, json_path = scrape_amazon(product_name, pincode, int(num_pages))
	return excel_path, csv_path, json_path


	with gr.Blocks() as demo:
	gr.Markdown("## 🛒 Amazon Scraper")

	with gr.Row():
	product_name = gr.Textbox(label="Product Name", placeholder="e.g., atta")
	pincode = gr.Textbox(label="Pincode", placeholder="e.g., 400076")
	num_pages = gr.Number(label="Number of Pages", value=2)

	scrape_button = gr.Button("Scrape Amazon!")

	output_excel = gr.File(label="Download Excel (.xlsx)")
	output_csv = gr.File(label="Download CSV (.csv)")
	output_json = gr.File(label="Download JSON (.json)")

	scrape_button.click(
	scrape_and_return_files,
	inputs=[product_name, pincode, num_pages],
	outputs=[output_excel, output_csv, output_json]
	)

	demo.launch(share=True)