Implement arXiv backend
- app.py +19 -2
- docs/docs.md +1 -0
- requirements.txt +2 -1
- serp.py +35 -0
app.py
CHANGED
@@ -10,7 +10,7 @@ import logging
 import uvicorn

 from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from serp import SerpQuery, SerpResults, query_arxiv, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
 from utils import log_gathered_exceptions

 logging.basicConfig(
@@ -68,6 +68,23 @@ async def search_google_scholar(params: SerpQuery):
     return SerpResults(results=flattened_results, error=None)


+@serp_router.post("/search_arxiv")
+async def search_arxiv(params: SerpQuery):
+    """Searches arxiv for the specified queries and returns the found documents."""
+    logging.info(f"Searching Arxiv for queries: {params.queries}")
+    results = await asyncio.gather(*[query_arxiv(httpx_client, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "arxiv search", params)
+
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
+
+
 @serp_router.post("/search_patents")
 async def search_patents(params: SerpQuery) -> SerpResults:
     """Searches google patents for the specified queries and returns the found documents."""
@@ -215,4 +232,4 @@ async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
 app.include_router(serp_router)
 app.include_router(scrap_router)

-uvicorn.run(app, host="
+uvicorn.run(app, host="0.0.0.0", port=7860)
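For context, a minimal client-side sketch of how the new endpoint could be exercised once the app is running (the `uvicorn.run` line above binds port 7860). It assumes the `serp_router` is mounted under the `/serp` prefix referenced in docs/docs.md and that `SerpQuery` accepts the `queries` and `n_results` fields used by the handler; the URL and payload are illustrative, not taken from this commit.

# Hypothetical usage sketch: POST to the new arXiv endpoint.
# Assumes serp_router is mounted at /serp and SerpQuery has `queries`/`n_results`.
import httpx

resp = httpx.post(
    "http://localhost:7860/serp/search_arxiv",
    json={"queries": ["attention is all you need"], "n_results": 3},
    timeout=30,
)
resp.raise_for_status()
# SerpResults mirrors the other endpoints: {"results": [...], "error": null}
print(resp.json())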
docs/docs.md
CHANGED
@@ -8,6 +8,7 @@ SERPent exposes an unified API to query SERP (Search Engine Result Pages) for a
 - Brave
 - Bing
 - Google Patents
+- arXiv
 - Google

 The application uses the `playwright` library to control a headless web browser, to simulate normal user activity, to fool the anti-bot measures often present on those sites. See the `/serp/` endpoints for search results scrapping.
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ pydantic
 playwright
 duckduckgo_search
 beautifulsoup4
-httpx
+httpx
+lxml
serp.py
CHANGED
@@ -1,11 +1,13 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from duckduckgo_search import DDGS
+import httpx
 from pydantic import BaseModel, Field
 from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
 from urllib.parse import quote_plus
 import logging
 import re
+from lxml import etree
 from asyncio import Semaphore

 # Concurrency limit for Playwright browser contexts.
@@ -243,3 +245,36 @@ async def query_ddg_search(q: str, n_results: int = 10):
             {"title": result["title"], "body": result["body"], "href": result["href"]})

     return results
+
+
+async def query_arxiv(client: httpx.AsyncClient, query: str, max_results: int = 3):
+    """Searches arXiv for the specified query and returns a list of results with titles and PDF URLs."""
+    ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
+    ARXIV_API_URL = 'https://export.arxiv.org/api/query?'
+
+    search_params = {
+        'search_query': query,
+        'start': 0,
+        'max_results': max_results
+    }
+    query_url = ARXIV_API_URL
+
+    response = await client.get(query_url, params=search_params)
+    response.raise_for_status()
+
+    root = etree.fromstring(response.content)
+    entries = root.findall('atom:entry', ATOM_NAMESPACE)
+
+    results = []
+    for entry in entries:
+        title = entry.find(
+            'atom:title', ATOM_NAMESPACE).text.strip().replace('\n', ' ')
+        id = entry.find('atom:id', ATOM_NAMESPACE).text.strip()
+        pdf_url = entry.find(
+            'atom:id', ATOM_NAMESPACE).text.replace('/abs/', '/pdf/')
+        summary = entry.find(
+            'atom:summary', ATOM_NAMESPACE).text.strip()
+        results.append({'title': title, 'href': pdf_url,
+                        'body': summary, 'id': id})
+
+    return results
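As a follow-up, a standalone sketch of calling `query_arxiv` directly, outside the FastAPI app. It relies only on what the diff shows: the function takes an `httpx.AsyncClient`, a query string and `max_results`, and returns dicts with `title`, `href` (the /pdf/ link), `body` and `id`; the example query is arbitrary.

# Standalone usage sketch for query_arxiv (illustrative query and result count).
import asyncio

import httpx

from serp import query_arxiv


async def main():
    async with httpx.AsyncClient() as client:
        results = await query_arxiv(client, "large language models", max_results=3)
        for r in results:
            # Each result carries the title, the /pdf/ URL, the abstract and the arXiv id.
            print(r["title"], "->", r["href"])


asyncio.run(main())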