# Commented-out earlier variant: scrapes Google search results for a query and
# pulls emails/phone numbers from the top hits. Kept for reference; the active
# endpoint below replaces it.
#
# from fastapi import FastAPI
# from playwright.async_api import async_playwright, TimeoutError
# import re
#
# app = FastAPI()
#
# async def scrape_google(query: str):
#     url = f"https://www.google.com/search?q={query}"
#     async with async_playwright() as pw:
#         browser = await pw.chromium.launch(headless=True)
#         context = await browser.new_context()
#         page = await context.new_page()
#         await page.goto(url, wait_until="domcontentloaded", timeout=60000)
#         try:
#             await page.wait_for_selector("div#search", timeout=10000)
#         except TimeoutError:
#             pass
#         links = []
#         for h in await page.query_selector_all("h3"):
#             try:
#                 # Walk up from the <h3> to its enclosing <a> to get the result URL.
#                 a = await h.evaluate_handle("e => e.closest('a')")
#                 anchor = a.as_element()
#                 if anchor is None:
#                     continue
#                 href = await anchor.get_attribute("href")
#                 title = await h.inner_text()
#                 links.append({"title": title, "link": href})
#             except Exception:
#                 continue
#         results = []
#         for item in links[:5]:
#             await page.goto(item["link"], wait_until="domcontentloaded", timeout=30000)
#             html = await page.content()
#             emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)
#             phones = re.findall(r"\+?\d[\d\s\-/]{7,}\d", html)
#             results.append({
#                 **item,
#                 "emails": list(set(emails))[:2],
#                 "phones": list(set(phones))[:2]
#             })
#         await browser.close()
#         return results
#
# @app.get("/search")
# async def search(query: str):
#     data = await scrape_google(query.replace(" ", "+"))
#     return {"query": query, "results": data}
from fastapi import FastAPI
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

app = FastAPI()


async def scrape_full_page(url: str):
    """Render the page in headless Chromium and return raw HTML plus headings/paragraphs."""
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto(url, wait_until="domcontentloaded", timeout=60000)
        # Wait for <body>; ignore the timeout so a slow page still returns whatever loaded.
        try:
            await page.wait_for_selector("body", timeout=10000)
        except PlaywrightTimeoutError:
            pass
        html = await page.content()
        # Extract headings & paragraphs as structured JSON
        items = await page.evaluate("""
            () => {
                const data = [];
                document.querySelectorAll('h1,h2,h3,h4,h5,h6,p').forEach(el => {
                    data.push({ tag: el.tagName.toLowerCase(), text: el.innerText.trim() });
                });
                return data;
            }
        """)
        await browser.close()
        return {"html": html, "content": items}


@app.get("/scrape")
async def scrape(url: str):
    """
    Fetches the full page and returns:
    - raw HTML
    - an array of objects: { tag: 'h1'|'p'|..., text: '...' }
    """
    result = await scrape_full_page(url)
    return result
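

# Example usage (a sketch; assumes this file is saved as main.py, that uvicorn is
# installed alongside fastapi and playwright, and that the Chromium build has been
# fetched with `playwright install chromium`):
#
#   uvicorn main:app --reload
#   curl "http://localhost:8000/scrape?url=https://example.com"
#
# The response is JSON of the form
# {"html": "<!DOCTYPE html>...", "content": [{"tag": "h1", "text": "..."}, ...]}.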