# from fastapi import FastAPI
# from playwright.async_api import async_playwright, TimeoutError
# import re
#
# app = FastAPI()
#
# async def scrape_google(query: str):
#     url = f"https://www.google.com/search?q={query}"
#     async with async_playwright() as pw:
#         browser = await pw.chromium.launch(headless=True)
#         context = await browser.new_context()
#         page = await context.new_page()
#         await page.goto(url, wait_until="domcontentloaded", timeout=60000)
#         try:
#             await page.wait_for_selector("div#search", timeout=10000)
#         except TimeoutError:
#             pass
#         links = []
#         for h in await page.query_selector_all("h3"):
#             try:
#                 a = await h.evaluate_handle("e => e.closest('a')")
#                 href = await a.get_attribute("href")
#                 title = await h.inner_text()
#                 links.append({"title": title, "link": href})
#             except:
#                 continue
#         results = []
#         for item in links[:5]:
#             await page.goto(item["link"], wait_until="domcontentloaded", timeout=30000)
#             html = await page.content()
#             emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
#             phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
#             results.append({
#                 **item,
#                 "emails": list(set(emails))[:2],
#                 "phones": list(set(phones))[:2]
#             })
#         await browser.close()
#         return results
#
# @app.get("/search")
# async def search(query: str):
#     data = await scrape_google(query.replace(" ", "+"))
#     return {"query": query, "results": data}
from fastapi import FastAPI
# Alias Playwright's TimeoutError so it does not shadow the built-in TimeoutError.
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

app = FastAPI()


async def scrape_full_page(url: str):
    async with async_playwright() as pw:
        # Launch a headless Chromium instance and open the target page.
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        try:
            # Give the page a moment to render; ignore the timeout if it never settles.
            await page.wait_for_selector("body", timeout=10000)
        except PlaywrightTimeoutError:
            pass
        html = await page.content()
        # Extract headings & paragraphs as structured JSON
        items = await page.evaluate("""
            () => {
                const data = [];
                document.querySelectorAll('h1,h2,h3,h4,h5,h6,p').forEach(el => {
                    data.push({ tag: el.tagName.toLowerCase(), text: el.innerText.trim() });
                });
                return data;
            }
        """)
        await browser.close()
        return {"html": html, "content": items}

async def scrape(url: str):
    """
    Fetches the full page and returns:
      - raw HTML
      - an array of objects: { tag: 'h1'|'p'|..., text: '...' }
    """
    result = await scrape_full_page(url)
    return result
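

# The active code above creates `app` but registers no route. Below is a minimal sketch
# of how `scrape` might be exposed over HTTP; the `/scrape` path, the `url` query
# parameter name, and the response shape are illustrative assumptions, not part of the
# original code.
@app.get("/scrape")
async def scrape_endpoint(url: str):
    data = await scrape(url)
    return {"url": url, "results": data}

# Example run (assuming this file is saved as main.py and uvicorn is installed):
#   uvicorn main:app --reload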