Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
848b14f
1
Parent(s):
26ddf5d
Add page limitation for PDF url extraction
Browse files
app.py
CHANGED
@@ -58,8 +58,9 @@ class Query(BaseModel):
|
|
58 |
class DocumentID(BaseModel):
|
59 |
doc_id: str
|
60 |
|
61 |
-
class
|
62 |
url: str
|
|
|
63 |
|
64 |
@app.post("/search")
|
65 |
async def get_articles(query: Query):
|
@@ -113,11 +114,11 @@ async def extract_arxiv_pdf(document: DocumentID):
|
|
113 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
114 |
|
115 |
@app.post("/extract_pdf/url")
|
116 |
-
async def extract_pdf(pdf:
|
117 |
pdf_req = requests.get(pdf.url)
|
118 |
if pdf_req.status_code == 200:
|
119 |
pdf_data = BytesIO(pdf_req.content)
|
120 |
-
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
121 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
122 |
pdf_metadata = doc.metadata
|
123 |
print(pdf_metadata)
|
|
|
58 |
class DocumentID(BaseModel):
|
59 |
doc_id: str
|
60 |
|
61 |
+
class PDF(BaseModel):
|
62 |
url: str
|
63 |
+
page_num: str = -1
|
64 |
|
65 |
@app.post("/search")
|
66 |
async def get_articles(query: Query):
|
|
|
114 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
115 |
|
116 |
@app.post("/extract_pdf/url")
|
117 |
+
async def extract_pdf(pdf: PDF):
|
118 |
pdf_req = requests.get(pdf.url)
|
119 |
if pdf_req.status_code == 200:
|
120 |
pdf_data = BytesIO(pdf_req.content)
|
121 |
+
doc = fitz.open(stream=pdf_data, filetype="pdf")[:pdf.page_num]
|
122 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
123 |
pdf_metadata = doc.metadata
|
124 |
print(pdf_metadata)
|