Sameercodes committed
Commit 02d6bfc · verified · 1 parent: 0accb88

Create app.py

Files changed (1): app.py (+195, −0)
app.py ADDED
@@ -0,0 +1,195 @@
import time
import random
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import gradio as gr
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
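# Runtime dependencies: selenium, pandas, gradio, plus an Excel writer such as
# openpyxl for DataFrame.to_excel().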

def scrape_amazon(search_term, pincode, num_pages=5):
    # Headless Chrome; the extra flags reduce the automation fingerprint and
    # avoid GPU/sandbox issues when running in containers.
    # NOTE: pincode is recorded in the output rows but never applied to the
    # Amazon session, so results reflect Amazon's default delivery location.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    driver = webdriver.Chrome(service=Service(), options=options)
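    # Selenium 4.6+ resolves a matching chromedriver automatically (Selenium
    # Manager); on older versions, pass the driver path to Service() instead.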

    all_products = []
    seen_titles = set()  # de-duplicates listings across pages

    for page in range(1, num_pages + 1):
        # quote_plus() makes multi-word searches URL-safe,
        # e.g. "whole wheat atta" -> "whole+wheat+atta"
        url = f"https://www.amazon.in/s?k={quote_plus(search_term)}&page={page}"
        driver.get(url)

        time.sleep(random.uniform(3, 5))  # let the page load

        # Scroll down to trigger lazy-loaded content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2, 4))

        # Every result card, organic or sponsored, carries this data attribute
        products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
        print(f"Scraping page {page}, found {len(products)} products...")

        for product in products:
            # Title
            try:
                title_elem = product.find_element(By.XPATH, ".//h2//span")
                title = title_elem.text.strip()
            except Exception:
                title = "No Title"

            if title in seen_titles:
                continue
            seen_titles.add(title)
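            # Skipping repeated titles also collapses sponsored re-listings of
            # the same product that Amazon often interleaves with organic results.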

            # Link
            try:
                link_elem = product.find_element(By.XPATH, ".//a[@class='a-link-normal s-no-outline']")
                link = link_elem.get_attribute('href')
                if link and link.startswith("/"):
                    link = "https://www.amazon.in" + link  # amazon.in, matching the site scraped
            except Exception:
                link = "No Link"

            # Selling price: prefer the visible whole-rupee figure, then fall
            # back to the hidden accessibility span.
            try:
                price_elem = product.find_element(By.XPATH, ".//span[@class='a-price-whole']")
                selling_price = price_elem.text.replace(',', '').strip()
            except Exception:
                try:
                    price_elem = product.find_element(By.XPATH, ".//span[@class='a-offscreen']")
                    selling_price = price_elem.text.replace('₹', '').replace(',', '').strip()
                except Exception:
                    selling_price = "No Price"

            # MRP (strike-through list price); if absent, assume no markdown
            try:
                mrp_elem = product.find_element(By.XPATH, ".//span[@class='a-price a-text-price']//span[@class='a-offscreen']")
                mrp = mrp_elem.text.replace('₹', '').replace(',', '').strip()
            except Exception:
                mrp = selling_price

            # Discount %
            try:
                if selling_price != "No Price" and mrp != "No Price":
                    discount_percent = round(100 * (float(mrp) - float(selling_price)) / float(mrp), 2)
                else:
                    discount_percent = 0.0
            except Exception:
                discount_percent = 0.0
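            # e.g. MRP 250 and selling price 200 -> round(100 * 50 / 250, 2) == 20.0;
            # the except also absorbs non-numeric prices and a zero MRP.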

            # Grammage (pack size) parsed from the title
            try:
                grammage_match = re.search(r'(\d+\.?\d*\s?(ml|g|kg|l))', title.lower())
                grammage = grammage_match.group(0) if grammage_match else "No Grammage"
            except Exception:
                grammage = "No Grammage"
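            # e.g. matches "5 kg" in "aashirvaad select atta 5 kg" as well as
            # "500g" and "1.5 l"; only the first quantity in the title is kept.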

            # Deal badges: any secondary-text span mentioning deal/coupon/save/limited
            # (translate() lower-cases the text, since XPath 1.0 has no lower-case())
            try:
                badge = product.find_element(By.XPATH, ".//div[contains(@class, 'a-color-secondary')]//span[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'deal') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'coupon') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'save') or contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'limited')]")
                deal_tag = badge.text.strip()
            except Exception:
                deal_tag = "No Deal"

            # Social proof, e.g. "1K+ bought in past month"
            try:
                qty = product.find_element(By.XPATH, ".//span[contains(text(),'bought in past month')]").text.strip()
            except Exception:
                qty = "No data"

            # Rating: the aria-label reads like "4.3 out of 5 stars"; the
            # first token is the numeric score.
            try:
                rating_elem = product.find_element(By.XPATH, ".//span[contains(@aria-label,'out of 5 stars')]")
                rating = rating_elem.get_attribute("aria-label").split()[0]
            except Exception:
                rating = "No Rating"

            # Review count
            try:
                reviews = product.find_element(By.XPATH, ".//a[contains(@aria-label,'ratings')]/span").text.strip()
            except Exception:
                reviews = "No Reviews"

            # Sponsored-listing detection
            try:
                product.find_element(By.XPATH, ".//span[contains(@class, 'puis-sponsored-label-text') and contains(text(), 'Sponsored')]")
                ad_status = "Ad"
            except Exception:
                ad_status = "Not Ad"

            # Compile product info
            product_data = {
                'Title': title,
                'Grammage': grammage,
                'Selling Price': selling_price,
                'MRP': mrp,
                'Discount %': discount_percent,
                'Deal Tags': deal_tag,
                'Quantity Bought': qty,
                'Rating': rating,
                'Reviews': reviews,
                'Link': link,
                'Ad/Not Ad': ad_status,
                'Date': datetime.now().strftime("%d-%m-%Y"),
                'Search Term': search_term,
                'Pincode': pincode,
                'Category': search_term,
            }

            all_products.append(product_data)

        time.sleep(random.uniform(2, 4))  # pause between pages

    driver.quit()

    # Collect results into a DataFrame
    df = pd.DataFrame(all_products)

    # Save outputs
    today_date = datetime.now().strftime("%Y-%m-%d")
    filename_base = f"{search_term}_scrape_{today_date}"

    excel_path = f"{filename_base}.xlsx"
    csv_path = f"{filename_base}.csv"
    json_path = f"{filename_base}.json"

    df.to_excel(excel_path, index=False)  # needs an Excel engine, e.g. openpyxl
    df.to_csv(csv_path, index=False)
    df.to_json(json_path, orient="records", lines=True)

    return excel_path, csv_path, json_path
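
# Example direct invocation (a minimal sketch assuming Chrome is available
# and the working directory is writable):
#   excel_path, csv_path, json_path = scrape_amazon("atta", "400076", num_pages=1)
#   print(pd.read_csv(csv_path).head())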

def scrape_and_return_files(product_name, pincode, num_pages):
    # Gradio's Number component delivers a float; coerce to int for range()
    excel_path, csv_path, json_path = scrape_amazon(product_name, pincode, int(num_pages))
    return excel_path, csv_path, json_path

with gr.Blocks() as demo:
    gr.Markdown("## 🛒 Amazon Scraper")

    with gr.Row():
        product_name = gr.Textbox(label="Product Name", placeholder="e.g., atta")
        pincode = gr.Textbox(label="Pincode", placeholder="e.g., 400076")
        num_pages = gr.Number(label="Number of Pages", value=2)

    scrape_button = gr.Button("Scrape Amazon!")

    output_excel = gr.File(label="Download Excel (.xlsx)")
    output_csv = gr.File(label="Download CSV (.csv)")
    output_json = gr.File(label="Download JSON (.json)")

    scrape_button.click(
        scrape_and_return_files,
        inputs=[product_name, pincode, num_pages],
        outputs=[output_excel, output_csv, output_json]
    )

demo.launch(share=True)
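# share=True serves the app on a temporary public *.gradio.live URL in
# addition to the local address.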