Game4all committed
Commit 66641c2 · 1 Parent(s): 15fdde6

Make scrapping async

Files changed (4):
  1. app.py +81 -71
  2. scrap.py +72 -60
  3. serp.py +39 -27
  4. utils.py +9 -0
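
Note: the core refactor in this commit replaces sequential per-query loops with asyncio.gather(..., return_exceptions=True) and then partitions successes from failures. A minimal standalone sketch of that pattern, where fake_query is a hypothetical stand-in for the real query/scrape helpers:

import asyncio

async def fake_query(q: str) -> list[str]:
    # Hypothetical stand-in for query_google_scholar / scrap_patent_async etc.
    if q == "bad":
        raise RuntimeError("query failed")
    return [f"result for {q}"]

async def main():
    queries = ["solar cells", "bad", "heat pumps"]
    # Fan all queries out concurrently; exceptions are returned, not raised.
    results = await asyncio.gather(*[fake_query(q) for q in queries],
                                   return_exceptions=True)
    # Partition successes from failures, then flatten the successful lists.
    ok = [r for r in results if not isinstance(r, Exception)]
    failed = [q for q, r in zip(queries, results) if isinstance(r, Exception)]
    flattened = [item for sublist in ok for item in sublist]
    print(flattened, failed)

asyncio.run(main())
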
app.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 from contextlib import asynccontextmanager
 from typing import Optional
 from fastapi import APIRouter, FastAPI
@@ -8,8 +9,9 @@ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 import logging
 import uvicorn
 
-from scrap import scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
+from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from utils import log_gathered_exceptions
 
 logging.basicConfig(
     level=logging.INFO,
@@ -47,95 +49,99 @@ serp_router = APIRouter(prefix="/serp", tags=["serp scrapping"])
 # ===================== Search endpoints =====================
 
 
-class SerpQuery(BaseModel):
-    queries: list[str] = Field(...,
-                               description="The list of queries to search for")
-    n_results: int = Field(
-        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
-
-
 @serp_router.post("/search_scholar")
 async def search_google_scholar(params: SerpQuery):
     """Queries google scholar for the specified query"""
-    results = []
-    for q in params.queries:
-        logging.info(f"Searching Google Scholar with query `{q}`")
-        try:
-            res = await query_google_scholar(pw_browser, q, params.n_results)
-            results.extend(res)
-        except Exception as e:
-            logging.error(
-                f"Failed to query Google Scholar with query `{q}`: {e}")
-    return SerpResults(results=results, error=None)
+    logging.info(f"Searching Google Scholar for queries: {params.queries}")
+    results = await asyncio.gather(*[query_google_scholar(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "google scholar search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_patents")
 async def search_patents(params: SerpQuery) -> SerpResults:
     """Searches google patents for the specified queries and returns the found documents."""
-    results = []
-    for q in params.queries:
-        logging.info(f"Searching Google Patents with query `{q}`")
-        try:
-            res = await query_google_patents(pw_browser, q, params.n_results)
-            results.extend(res)
-        except Exception as e:
-            logging.error(
-                f"Failed to query Google Patents with query `{q}`: {e}")
-    return SerpResults(results=results, error=None)
+    logging.info(f"Searching Google Patents for queries: {params.queries}")
+    results = await asyncio.gather(*[query_google_patents(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "google patent search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_brave")
 async def search_brave(params: SerpQuery) -> SerpResults:
     """Searches brave search for the specified queries and returns the found documents."""
-    results = []
-    last_exception: Optional[Exception] = None
-    for q in params.queries:
-        logging.info(f"Searching Brave search with query `{q}`")
-        try:
-            res = await query_brave_search(pw_browser, q, params.n_results)
-            results.extend(res)
-        except Exception as e:
-            last_exception = e
-            logging.error(
-                f"Failed to query Brave search with query `{q}`: {e}")
-
-    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+    logging.info(f"Searching Brave Search for queries: {params.queries}")
+    results = await asyncio.gather(*[query_brave_search(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "brave search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_bing")
 async def search_bing(params: SerpQuery) -> SerpResults:
     """Searches Bing search for the specified queries and returns the found documents."""
-    results = []
-    last_exception: Optional[Exception] = None
-    for q in params.queries:
-        logging.info(f"Searching Bing search with query `{q}`")
-        try:
-            res = await query_bing_search(pw_browser, q, params.n_results)
-            results.extend(res)
-        except Exception as e:
-            last_exception = e
-            logging.error(
-                f"Failed to query Bing search with query `{q}`: {e}")
-
-    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+    logging.info(f"Searching Bing Search for queries: {params.queries}")
+    results = await asyncio.gather(*[query_bing_search(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "bing search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_duck")
 async def search_duck(params: SerpQuery) -> SerpResults:
     """Searches duckduckgo for the specified queries and returns the found documents"""
-    results = []
-    last_exception: Optional[Exception] = None
-
-    for q in params.queries:
-        logging.info(f"Querying DDG with query: `{q}`")
-        try:
-            res = await query_ddg_search(q, params.n_results)
-            results.extend(res)
-        except Exception as e:
-            last_exception = e
-            logging.error(f"Failed to query DDG with query `{q}`: {e}")
-
-    return SerpResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+    logging.info(f"Searching DuckDuckGo for queries: {params.queries}")
+    results = await asyncio.gather(*[query_ddg_search(q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "duckduckgo search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search")
@@ -180,11 +186,16 @@ async def search(params: SerpQuery):
 # =========================== Scrapping endpoints ===========================
 
 
+# TODO: return a proper error response if the patent is not found or scrapping fails
 @scrap_router.get("/scrap_patent/{patent_id}")
 async def scrap_patent(patent_id: str):
     """Scraps the specified patent from Google Patents."""
-    patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
-    return patent
+    try:
+        patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
+        return patent
+    except Exception as e:
+        logging.warning(f"Failed to scrap patent {patent_id}: {e}")
+        return None
 
 
 class ScrapPatentsRequest(BaseModel):
@@ -193,11 +204,10 @@ class ScrapPatentsRequest(BaseModel):
                                    description="List of patent IDs to scrap")
 
 
-@scrap_router.post("/scrap_patents_bulk")
-async def scrap_patents(params: ScrapPatentsRequest):
+@scrap_router.post("/scrap_patents_bulk", response_model=PatentScrapBulkResponse)
+async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
     """Scraps multiple patents from Google Patents."""
-    patents = await scrap_patent_bulk_async(httpx_client, [
-        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
+    patents = await scrap_patent_bulk_async(httpx_client, params.patent_ids)
    return patents

 # ===============================================================================
@@ -205,4 +215,4 @@ async def scrap_patents(params: ScrapPatentsRequest):
 app.include_router(serp_router)
 app.include_router(scrap_router)
 
-uvicorn.run(app, host="0.0.0.0", port=7860)
+uvicorn.run(app, host="127.0.0.1", port=7860)
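
With the routes above, a client sends one request per batch of queries and the server fans the queries out concurrently. A rough usage sketch, assuming the app is running locally on 127.0.0.1:7860 as configured in uvicorn.run above; the query strings are made up, and the request body mirrors the SerpQuery model now defined in serp.py:

import httpx

# POST a batch of queries to one of the SERP endpoints.
resp = httpx.post(
    "http://127.0.0.1:7860/serp/search_patents",
    json={"queries": ["microfluidic pump", "solid state battery"], "n_results": 10},
    timeout=120,
)
resp.raise_for_status()
data = resp.json()  # SerpResults: {"results": [...], "error": null}
print(len(data["results"]), data["error"])
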
scrap.py CHANGED
@@ -27,65 +27,77 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
     }
-    try:
-        response = await client.get(patent_url, headers=headers)
-        response.raise_for_status()
-
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        # Abstract
-        abstract_div = soup.find("div", {"class": "abstract"})
-        abstract = abstract_div.get_text(
-            strip=True) if abstract_div else None
-
-        # Description
-        description_section = soup.find("section", itemprop="description")
-        description = description_section.get_text(
-            separator="\n", strip=True) if description_section else None
-
-        # Field of the Invention
-        invention_field_match = re.findall(
-            r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
-        invention_field = invention_field_match[0][1].strip(
-        ) if invention_field_match else None
-
-        # Background of the Invention
-        invention_background_match = re.findall(
-            r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
-        invention_background = invention_background_match[0][1].strip(
-        ) if invention_background_match else None
-
-        # Claims
-        claims_section = soup.find("section", itemprop="claims")
-        claims = claims_section.get_text(
-            separator="\n", strip=True) if claims_section else None
-
-        # Patent Title
-        meta_title = soup.find("meta", {"name": "DC.title"}).get(
-            "content").strip()
-
-        # Patent publication number
-        # pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
-        # get the h2 with id ="pubnum" and extract the text
-
-        return PatentScrapResult(
-            # publication_number=pub_num,
-            abstract=abstract,
-            description=description,
-            claims=claims,
-            title=meta_title,
-            field_of_invention=invention_field,
-            background=invention_background
-        )
-    except Exception as e:
-        logging.error(f"Error scraping {patent_url}: {e}")
-        return None
-
-
-async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
+    response = await client.get(patent_url, headers=headers)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Abstract
+    abstract_div = soup.find("div", {"class": "abstract"})
+    abstract = abstract_div.get_text(
+        strip=True) if abstract_div else None
+
+    # Description
+    description_section = soup.find("section", itemprop="description")
+    description = description_section.get_text(
+        separator="\n", strip=True) if description_section else None
+
+    # Field of the Invention
+    invention_field_match = re.findall(
+        r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+    invention_field = invention_field_match[0][1].strip(
+    ) if invention_field_match else None
+
+    # Background of the Invention
+    invention_background_match = re.findall(
+        r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+    invention_background = invention_background_match[0][1].strip(
+    ) if invention_background_match else None
+
+    # Claims
+    claims_section = soup.find("section", itemprop="claims")
+    claims = claims_section.get_text(
+        separator="\n", strip=True) if claims_section else None
+
+    # Patent Title
+    meta_title = soup.find("meta", {"name": "DC.title"}).get(
+        "content").strip()
+
+    # Patent publication number
+    # pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
+    # get the h2 with id ="pubnum" and extract the text
+
+    return PatentScrapResult(
+        # publication_number=pub_num,
+        abstract=abstract,
+        description=description,
+        claims=claims,
+        title=meta_title,
+        field_of_invention=invention_field,
+        background=invention_background
+    )
+
+
+class PatentScrapBulkResponse(BaseModel):
+    """Response model for bulk patent scraping."""
+    patents: list[PatentScrapResult]
+    failed_ids: list[str]
+
+
+async def scrap_patent_bulk_async(client: AsyncClient, patent_ids: list[int]) -> PatentScrapBulkResponse:
     """Scrape multiple patents asynchronously."""
-    tasks = [scrap_patent_async(client, url) for url in patent_urls]
-    results = await asyncio.gather(*tasks)
-
-    # Filter out None results (failed scrapes)
-    return [res for res in results if res is not None]
+    urls = [
+        f"https://patents.google.com/patent/{pid}/en" for pid in patent_ids]
+    results = await asyncio.gather(*[scrap_patent_async(client, url) for url in urls], return_exceptions=True)
+
+    filtered_results = [
+        res for res in results if not isinstance(res, Exception)]
+
+    failed_ids = [
+        patent_ids[i] for i, res in enumerate(results) if isinstance(res, Exception)
+    ]
+
+    return PatentScrapBulkResponse(
+        patents=filtered_results,
+        failed_ids=failed_ids
+    )
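
The bulk scraper now takes patent IDs rather than full URLs and reports failures explicitly instead of silently dropping them. A hedged sketch of how a caller might consume the new return shape; the IDs and client setup are illustrative only:

import asyncio
from httpx import AsyncClient
from scrap import scrap_patent_bulk_async

async def main():
    async with AsyncClient() as client:
        # IDs are only interpolated into the Google Patents URL, so strings
        # work despite the list[int] annotation on the function.
        res = await scrap_patent_bulk_async(client, ["US9876543B2", "EP1234567A1"])
        print(f"scraped {len(res.patents)} patents, failed: {res.failed_ids}")

asyncio.run(main())
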
serp.py CHANGED
@@ -1,11 +1,24 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from duckduckgo_search import DDGS
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
 from urllib.parse import quote_plus
 import logging
 import re
+from asyncio import Semaphore
+
+# Concurrency limit for Playwright browser contexts.
+# This is to prevent too many concurrent browser contexts from being created,
+PLAYWRIGHT_CONCURRENCY_LIMIT = 10
+
+
+class SerpQuery(BaseModel):
+    queries: list[str] = Field(...,
+                               description="The list of queries to search for")
+    n_results: int = Field(
+        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
+
 
 class SerpResults(BaseModel):
     """Model for SERP scrapping results"""
@@ -21,16 +34,20 @@ class BraveSearchBlockedException(Exception):
     pass
 
 
+_PLAYWRIGHT_CONCURRENCY_SEMAPHORE = Semaphore(PLAYWRIGHT_CONCURRENCY_LIMIT)
+
+
 @asynccontextmanager
 async def playwright_open_page(browser: Browser):
     """Context manager for playwright pages"""
-    context: BrowserContext = await browser.new_context()
-    page: Page = await context.new_page()
-    try:
-        yield page
-    finally:
-        await page.close()
-        await context.close()
+    async with _PLAYWRIGHT_CONCURRENCY_SEMAPHORE:
+        context: BrowserContext = await browser.new_context()
+        page: Page = await context.new_page()
+        try:
+            yield page
+        finally:
+            await page.close()
+            await context.close()
 
 
 async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
@@ -145,28 +162,23 @@ async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
 
     results = []
 
-    try:
-        for result in results_cards:
-            title = await result.locator('.title').all_inner_texts()
-            description = await result.locator('.snippet-description').all_inner_texts()
-            url = await result.locator('a').nth(0).get_attribute('href')
-
-            # Filter out results with no URL or brave-specific URLs
-            if url is None or url.startswith('/'):
-                continue
-
-            results.append({
-                "title": title[0] if title else "",
-                "body": description[0] if description else "",
-                "href": url
-            })
-
-            if len(results) >= n_results:
-                break
-
-    except TimeoutError as e:
-        logging.warning(
-            f"Timeout on selector while parsing Brave Search SERP: {e}")
+    for result in results_cards:
+        title = await result.locator('.title').all_inner_texts()
+        description = await result.locator('.snippet-description').all_inner_texts()
+        url = await result.locator('a').nth(0).get_attribute('href')
+
+        # Filter out results with no URL or brave-specific URLs
+        if url is None or url.startswith('/'):
+            continue
+
+        results.append({
+            "title": title[0] if title else "",
+            "body": description[0] if description else "",
+            "href": url
+        })
+
+        if len(results) >= n_results:
+            break
 
     return results
 
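
The semaphore added above caps how many Playwright contexts the gathered queries can hold open at once, since every query now runs concurrently. A self-contained sketch of the same bounding pattern with plain asyncio (no browser involved; all names are illustrative):

import asyncio

LIMIT = 3
_sem = asyncio.Semaphore(LIMIT)

async def bounded_task(i: int) -> int:
    async with _sem:              # at most LIMIT bodies run at the same time
        await asyncio.sleep(0.1)  # stand-in for opening a context / loading a page
        return i

async def main():
    # Ten tasks are scheduled at once, but only three hold the semaphore at any moment.
    results = await asyncio.gather(*[bounded_task(i) for i in range(10)])
    print(results)

asyncio.run(main())
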
utils.py ADDED
@@ -0,0 +1,9 @@
+from asyncio.log import logger
+from serp import SerpQuery
+
+
+def log_gathered_exceptions(results: list, context: str, params: SerpQuery):
+    """Logs gathered exceptions with context and parameters."""
+    for exc, q in zip(results, params.queries):
+        if isinstance(exc, Exception):
+            logger.warning(f"Error during {context} for query '{q}': {exc}")
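
For reference, this helper is meant to be paired with asyncio.gather(..., return_exceptions=True), as in the app.py endpoints above. A small hedged usage sketch; the flaky coroutine is hypothetical and exists only to produce a failure to log:

import asyncio
from serp import SerpQuery
from utils import log_gathered_exceptions

async def flaky(q: str) -> list[str]:
    # Hypothetical query coroutine used only for illustration.
    raise ValueError(f"no results for {q}")

async def main():
    params = SerpQuery(queries=["a", "b"], n_results=10)
    results = await asyncio.gather(*[flaky(q) for q in params.queries],
                                   return_exceptions=True)
    # Logs one warning per failed query, keyed by the query text.
    log_gathered_exceptions(results, "demo search", params)

asyncio.run(main())
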