Omar ID EL MOUMEN committed on
Commit
848b14f
·
1 Parent(s): 26ddf5d

Add page limitation for PDF url extraction

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -58,8 +58,9 @@ class Query(BaseModel):
58
  class DocumentID(BaseModel):
59
  doc_id: str
60
 
61
- class WebPDF(BaseModel):
62
  url: str
 
63
 
64
  @app.post("/search")
65
  async def get_articles(query: Query):
@@ -113,11 +114,11 @@ async def extract_arxiv_pdf(document: DocumentID):
113
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
114
 
115
  @app.post("/extract_pdf/url")
116
- async def extract_pdf(pdf: WebPDF):
117
  pdf_req = requests.get(pdf.url)
118
  if pdf_req.status_code == 200:
119
  pdf_data = BytesIO(pdf_req.content)
120
- doc = fitz.open(stream=pdf_data, filetype="pdf")
121
  pdf_text = " ".join([page.get_text("text") for page in doc])
122
  pdf_metadata = doc.metadata
123
  print(pdf_metadata)
 
58
  class DocumentID(BaseModel):
59
  doc_id: str
60
 
61
+ class PDF(BaseModel):
62
  url: str
63
+ page_num: str = -1
64
 
65
  @app.post("/search")
66
  async def get_articles(query: Query):
 
114
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
115
 
116
  @app.post("/extract_pdf/url")
117
+ async def extract_pdf(pdf: PDF):
118
  pdf_req = requests.get(pdf.url)
119
  if pdf_req.status_code == 200:
120
  pdf_data = BytesIO(pdf_req.content)
121
+ doc = fitz.open(stream=pdf_data, filetype="pdf")[:pdf.page_num]
122
  pdf_text = " ".join([page.get_text("text") for page in doc])
123
  pdf_metadata = doc.metadata
124
  print(pdf_metadata)