from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from playwright.async_api import async_playwright
import base64
import logging
from typing import List, Optional

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Playwright Web Scraper", description="A simple web scraper using Playwright")


class LinkInfo(BaseModel):
    text: str
    href: str


class ScrapeResponse(BaseModel):
    content: Optional[str] = None
    screenshot: Optional[str] = None  # base64-encoded PNG
    links: Optional[List[LinkInfo]] = None


@app.get("/")
async def root():
    return {
        "message": "Playwright Web Scraper API",
        "endpoints": {
            "/scrape": "Scrape a webpage (GET request)",
            "/docs": "API documentation"
        },
        "example": "/scrape?url=https://example.com&screenshot=true&get_links=true&get_content=false"
    }


@app.get("/scrape")
async def scrape_page(
    url: str = Query(..., description="URL to scrape"),
    screenshot: bool = Query(True, description="Take a screenshot"),
    get_links: bool = Query(True, description="Extract links"),
    get_content: bool = Query(False, description="Get page content (can be large)")
):
    logger.info(f"Starting scrape for URL: {url}")

    try:
        async with async_playwright() as p:
            logger.info("Launching browser...")
            # Flags commonly needed to run Chromium inside containers and CI sandboxes
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-gpu'
                ]
            )

            page = await browser.new_page()

            try:
                logger.info(f"Navigating to {url}...")
                await page.goto(url, wait_until="networkidle")

                response = ScrapeResponse()

                # Get page content
                if get_content:
                    logger.info("Getting page content...")
                    response.content = await page.content()

                # Get screenshot (base64-encode the raw bytes so they fit in a JSON response)
                if screenshot:
                    logger.info("Taking screenshot...")
                    screenshot_bytes = await page.screenshot()
                    response.screenshot = base64.b64encode(screenshot_bytes).decode('utf-8')

                # Get links (evaluated in the page context; returns plain dicts)
                if get_links:
                    logger.info("Extracting links...")
                    links = await page.evaluate("""
                        () => {
                            return Array.from(document.querySelectorAll('a')).map(a => {
                                return {
                                    text: a.innerText.trim(),
                                    href: a.href
                                }
                            });
                        }
                    """)
                    response.links = [LinkInfo(**link) for link in links]

                logger.info("Scraping completed successfully")
                return response

            except Exception as e:
                logger.error(f"Error during scraping: {str(e)}")
                raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
            finally:
                # Close the browser on both the success and error paths
                await browser.close()

    except HTTPException:
        # Re-raise scraping errors unchanged rather than wrapping them as launch errors
        raise
    except Exception as e:
        logger.error(f"Error launching browser: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Browser launch error: {str(e)}")
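

# --- Local-run sketch (not part of the original service) ---
# A minimal way to serve this app for manual testing, assuming uvicorn is
# installed (`pip install uvicorn`) and Chromium has been fetched with
# `playwright install chromium`. The host and port values are illustrative.
#
# Example request once running:
#   curl "http://localhost:8000/scrape?url=https://example.com&screenshot=false"
if __name__ == "__main__":
    import uvicorn

    # Bind to all interfaces on port 8000; adjust for your deployment.
    uvicorn.run(app, host="0.0.0.0", port=8000)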