Spaces:
Running
Running
Omar ID EL MOUMEN
committed on
Commit
·
a5f46a9
1
Parent(s):
26aea4d
Add text field to get formatted text
Browse files
app.py
CHANGED
@@ -89,6 +89,7 @@ async def extract_text_pdf(id_doc: str):
|
|
89 |
|
90 |
postprocess_text = remove_in_betweens(pdf_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
|
|
92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
93 |
titles = doc.get_toc()
|
94 |
main_titles = []
|
@@ -98,12 +99,12 @@ async def extract_text_pdf(id_doc: str):
|
|
98 |
for title in titles:
|
99 |
if title[0] == 1:
|
100 |
main_titles.append(title[1])
|
101 |
-
return {"
|
102 |
else:
|
103 |
print("ID: " + id_doc)
|
104 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
105 |
print("Status code: " + str(pdf_req.status_code))
|
106 |
-
return {"error": True, "message": "Error while downloading PDF: " + str(pdf_req.status_code)}
|
107 |
|
108 |
@app.get("/extract/random/{keyword}/{limit}")
|
109 |
async def extract_random_pdf(keyword: str, limit: int):
|
|
|
89 |
|
90 |
postprocess_text = remove_in_betweens(pdf_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
+
postprocess_text = re.sub(r"\ +", " ", postprocess_text)
|
93 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
94 |
titles = doc.get_toc()
|
95 |
main_titles = []
|
|
|
99 |
for title in titles:
|
100 |
if title[0] == 1:
|
101 |
main_titles.append(title[1])
|
102 |
+
return {"pub_id": id_doc, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": id_doc, "titles": "No titles found !", "text": postprocess_text, "error": False}
|
103 |
else:
|
104 |
print("ID: " + id_doc)
|
105 |
print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
|
106 |
print("Status code: " + str(pdf_req.status_code))
|
107 |
+
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
108 |
|
109 |
@app.get("/extract/random/{keyword}/{limit}")
|
110 |
async def extract_random_pdf(keyword: str, limit: int):
|