Game4all committed on
Commit
21275ec
·
1 Parent(s): 66641c2

Implement arXiv backend

Browse files
Files changed (4) hide show
  1. app.py +19 -2
  2. docs/docs.md +1 -0
  3. requirements.txt +2 -1
  4. serp.py +35 -0
app.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
  import uvicorn
11
 
12
  from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
13
- from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
14
  from utils import log_gathered_exceptions
15
 
16
  logging.basicConfig(
@@ -68,6 +68,23 @@ async def search_google_scholar(params: SerpQuery):
68
  return SerpResults(results=flattened_results, error=None)
69
 
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  @serp_router.post("/search_patents")
72
  async def search_patents(params: SerpQuery) -> SerpResults:
73
  """Searches google patents for the specified queries and returns the found documents."""
@@ -215,4 +232,4 @@ async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
215
  app.include_router(serp_router)
216
  app.include_router(scrap_router)
217
 
218
- uvicorn.run(app, host="127.0.0.1", port=7860)
 
10
  import uvicorn
11
 
12
  from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
13
+ from serp import SerpQuery, SerpResults, query_arxiv, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
14
  from utils import log_gathered_exceptions
15
 
16
  logging.basicConfig(
 
68
  return SerpResults(results=flattened_results, error=None)
69
 
70
 
71
@serp_router.post("/search_arxiv")
async def search_arxiv(params: SerpQuery) -> SerpResults:
    """Searches arXiv for the specified queries and returns the found documents.

    Args:
        params: The search queries and the number of results to fetch per query.

    Returns:
        SerpResults with the flattened results of all successful queries, or an
        error message when every query failed.
    """
    logging.info(f"Searching Arxiv for queries: {params.queries}")
    # Fan out one arXiv request per query; collect exceptions instead of failing fast.
    results = await asyncio.gather(
        *[query_arxiv(httpx_client, q, params.n_results) for q in params.queries],
        return_exceptions=True)
    log_gathered_exceptions(results, "arxiv search", params)

    # Drop failed queries and keep whatever succeeded.
    filtered_results = [r for r in results if not isinstance(r, Exception)]
    flattened_results = [
        item for sublist in filtered_results for item in sublist]

    if len(filtered_results) == 0:
        # All queries failed (or none were given): surface the last error, if any,
        # instead of raising IndexError on an empty results list.
        return SerpResults(
            results=[], error=str(results[-1]) if results else None)

    return SerpResults(results=flattened_results, error=None)
88
  @serp_router.post("/search_patents")
89
  async def search_patents(params: SerpQuery) -> SerpResults:
90
  """Searches google patents for the specified queries and returns the found documents."""
 
232
  app.include_router(serp_router)
233
  app.include_router(scrap_router)
234
 
235
+ uvicorn.run(app, host="0.0.0.0", port=7860)
docs/docs.md CHANGED
@@ -8,6 +8,7 @@ SERPent exposes an unified API to query SERP (Search Engine Result Pages) for a
8
  - Brave
9
  - Bing
10
  - Google Patents
 
11
  - Google
12
 
13
  The application uses the `playwright` library to control a headless web browser, to simulate normal user activity, to fool the anti-bot measures often present on those sites. See the `/serp/` endpoints for search results scrapping.
 
8
  - Brave
9
  - Bing
10
  - Google Patents
11
+ - arXiv
12
  - Google
13
 
14
  The application uses the `playwright` library to control a headless web browser, to simulate normal user activity, to fool the anti-bot measures often present on those sites. See the `/serp/` endpoints for search results scraping.
requirements.txt CHANGED
@@ -4,4 +4,5 @@ pydantic
4
  playwright
5
  duckduckgo_search
6
  beautifulsoup4
7
- httpx
 
 
4
  playwright
5
  duckduckgo_search
6
  beautifulsoup4
7
+ httpx
8
+ lxml
serp.py CHANGED
@@ -1,11 +1,13 @@
1
  from contextlib import asynccontextmanager
2
  from typing import Optional
3
  from duckduckgo_search import DDGS
 
4
  from pydantic import BaseModel, Field
5
  from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
6
  from urllib.parse import quote_plus
7
  import logging
8
  import re
 
9
  from asyncio import Semaphore
10
 
11
  # Concurrency limit for Playwright browser contexts.
@@ -243,3 +245,36 @@ async def query_ddg_search(q: str, n_results: int = 10):
243
  {"title": result["title"], "body": result["body"], "href": result["href"]})
244
 
245
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from contextlib import asynccontextmanager
2
  from typing import Optional
3
  from duckduckgo_search import DDGS
4
+ import httpx
5
  from pydantic import BaseModel, Field
6
  from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
7
  from urllib.parse import quote_plus
8
  import logging
9
  import re
10
+ from lxml import etree
11
  from asyncio import Semaphore
12
 
13
  # Concurrency limit for Playwright browser contexts.
 
245
  {"title": result["title"], "body": result["body"], "href": result["href"]})
246
 
247
  return results
248
+
249
+
250
async def query_arxiv(client: httpx.AsyncClient, query: str, max_results: int = 3):
    """Searches arXiv for the specified query and returns a list of results.

    Each result is a dict with keys 'title', 'href' (PDF URL), 'body'
    (abstract text), and 'id' (the arXiv abstract URL).

    Args:
        client: Shared httpx async client used to call the arXiv API.
        query: Free-text arXiv search query.
        max_results: Maximum number of entries to request.

    Raises:
        httpx.HTTPStatusError: If the arXiv API responds with an error status.
    """
    ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
    ARXIV_API_URL = 'https://export.arxiv.org/api/query?'

    search_params = {
        'search_query': query,
        'start': 0,
        'max_results': max_results
    }

    response = await client.get(ARXIV_API_URL, params=search_params)
    response.raise_for_status()

    # The arXiv API returns an Atom XML feed; parse it with lxml.
    root = etree.fromstring(response.content)

    results = []
    for entry in root.findall('atom:entry', ATOM_NAMESPACE):
        title_el = entry.find('atom:title', ATOM_NAMESPACE)
        id_el = entry.find('atom:id', ATOM_NAMESPACE)
        summary_el = entry.find('atom:summary', ATOM_NAMESPACE)
        # Skip malformed entries rather than raising AttributeError on None.
        if title_el is None or id_el is None or summary_el is None:
            continue

        entry_id = id_el.text.strip()
        results.append({
            # Atom titles may wrap across lines; collapse newlines to spaces.
            'title': title_el.text.strip().replace('\n', ' '),
            # The abstract URL becomes the PDF URL by swapping '/abs/' for '/pdf/'.
            'href': entry_id.replace('/abs/', '/pdf/'),
            'body': summary_el.text.strip(),
            'id': entry_id
        })

    return results