SERPent

Running

App Files Files Community

SERPent / scrap.py

Game4all

Make scrapping async

66641c2 about 14 hours ago

raw

history blame contribute delete

3.89 kB

	import asyncio
	import logging
	import re
	from typing import Optional
	from httpx import AsyncClient
	from bs4 import BeautifulSoup
	from pydantic import BaseModel


	class PatentScrapResult(BaseModel):
	    """Schema for the result of scraping a google patents page."""

	    # Patent title; the only mandatory field.
	    title: str

	    # Abstract text, or None when the page has no abstract block.
	    abstract: Optional[str] = None

	    # Whole description section (field of the invention, background,
	    # summary, ...) as one string, or None when the section is absent.
	    description: Optional[str] = None

	    # Full text of the claims section, or None when absent.
	    claims: Optional[str] = None

	    # "Field of the invention" excerpt taken from the description, if found.
	    field_of_invention: Optional[str] = None

	    # "Background of the invention" excerpt taken from the description, if found.
	    background: Optional[str] = None


	# Heading-delimited excerpts of the description text. Headings are matched
	# case-insensitively and DOTALL lets the lazily matched body span several
	# lines. The single capture group is the text between the opening heading
	# and the first following heading from the terminator list.
	_FIELD_OF_INVENTION_RE = re.compile(
	    r"(?:FIELD OF THE INVENTION|TECHNICAL FIELD)"
	    r"(.*?)"
	    r"(?:BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY"
	    r"|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART)",
	    re.IGNORECASE | re.DOTALL,
	)
	_BACKGROUND_RE = re.compile(
	    r"(?:BACKGROUND OF THE INVENTION|BACKGROUND)"
	    r"(.*?)"
	    r"(?:SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION"
	    r"|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION)",
	    re.IGNORECASE | re.DOTALL,
	)


	def _extract_section(description: Optional[str], pattern: "re.Pattern") -> Optional[str]:
	    """Return the stripped text captured by *pattern* in *description*, or None."""
	    if description is None:
	        return None
	    match = pattern.search(description)
	    return match.group(1).strip() if match else None


	async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
	    """Fetch one patent page and extract its main textual sections.

	    Args:
	        client: HTTP client used to issue the GET request.
	        patent_url: Full URL of the patent page to scrape.

	    Returns:
	        A PatentScrapResult. Every field except ``title`` is None when the
	        corresponding element or heading is not found in the page.

	    Raises:
	        ValueError: If the page has no ``DC.title`` meta tag with a
	            ``content`` attribute, so no title can be extracted.
	        Exception: Whatever ``client.get`` or ``response.raise_for_status()``
	            raise for transport errors or error status codes.
	    """
	    # NOTE(review): this User-Agent presents the scraper as OpenAI's GPTBot;
	    # confirm that this is intentional.
	    headers = {
	        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
	    }
	    response = await client.get(patent_url, headers=headers)
	    response.raise_for_status()

	    soup = BeautifulSoup(response.text, "html.parser")

	    # Abstract
	    abstract_div = soup.find("div", {"class": "abstract"})
	    abstract = abstract_div.get_text(strip=True) if abstract_div else None

	    # Description (one text node per line)
	    description_section = soup.find("section", itemprop="description")
	    description = (
	        description_section.get_text(separator="\n", strip=True)
	        if description_section
	        else None
	    )

	    # Sub-sections are located purely by their headings inside the
	    # description text; a terminator word appearing in running prose ends
	    # the excerpt early, exactly as a real heading would.
	    invention_field = _extract_section(description, _FIELD_OF_INVENTION_RE)
	    invention_background = _extract_section(description, _BACKGROUND_RE)

	    # Claims
	    claims_section = soup.find("section", itemprop="claims")
	    claims = (
	        claims_section.get_text(separator="\n", strip=True)
	        if claims_section
	        else None
	    )

	    # Patent title: required by the result schema, so fail with a clear
	    # message instead of an AttributeError on a missing tag/attribute.
	    title_tag = soup.find("meta", {"name": "DC.title"})
	    raw_title = title_tag.get("content") if title_tag else None
	    if raw_title is None:
	        raise ValueError(f"No DC.title meta tag with content found at {patent_url}")
	    meta_title = raw_title.strip()

	    # Publication number extraction (text of h2#pubnum) is currently disabled.

	    return PatentScrapResult(
	        abstract=abstract,
	        description=description,
	        claims=claims,
	        title=meta_title,
	        field_of_invention=invention_field,
	        background=invention_background,
	    )


	class PatentScrapBulkResponse(BaseModel):
	    """Response model for bulk patent scraping."""

	    # One entry per patent that was scraped successfully.
	    patents: list[PatentScrapResult]

	    # Identifiers of the patents whose scrape did not succeed.
	    failed_ids: list[str]


	async def scrap_patent_bulk_async(client: AsyncClient, patent_ids: list[int]) -> PatentScrapBulkResponse:
	    """Scrape multiple patents asynchronously.

	    All pages are requested concurrently through ``asyncio.gather``. A
	    failure on one patent does not abort the others: the failing id is
	    logged and reported in ``failed_ids`` instead.

	    Args:
	        client: HTTP client shared by all requests.
	        patent_ids: Identifiers interpolated into
	            ``https://patents.google.com/patent/{id}/en``.
	            NOTE(review): annotated as ``int`` but reported back as
	            strings in ``failed_ids``; confirm the intended id type.

	    Returns:
	        A PatentScrapBulkResponse holding the successfully scraped patents
	        (in input order) and the string form of every id that failed.
	    """
	    outcomes = await asyncio.gather(
	        *(
	            scrap_patent_async(client, f"https://patents.google.com/patent/{pid}/en")
	            for pid in patent_ids
	        ),
	        return_exceptions=True,
	    )

	    patents: list[PatentScrapResult] = []
	    failed_ids: list[str] = []
	    for pid, outcome in zip(patent_ids, outcomes):
	        # gather(return_exceptions=True) can hand back BaseException
	        # instances (e.g. asyncio.CancelledError), not only Exception ones;
	        # none of them must be treated as a scraped patent.
	        if isinstance(outcome, BaseException):
	            logging.getLogger(__name__).warning(
	                "Failed to scrape patent %s: %r", pid, outcome
	            )
	            # The response schema declares failed_ids as strings.
	            failed_ids.append(str(pid))
	        else:
	            patents.append(outcome)

	    return PatentScrapBulkResponse(
	        patents=patents,
	        failed_ids=failed_ids,
	    )