Omar ID EL MOUMEN committed on
Commit
a5f46a9
·
1 Parent(s): 26aea4d

Add text field to get formatted text

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -89,6 +89,7 @@ async def extract_text_pdf(id_doc: str):
89
 
90
  postprocess_text = remove_in_betweens(pdf_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
 
92
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
93
  titles = doc.get_toc()
94
  main_titles = []
@@ -98,12 +99,12 @@ async def extract_text_pdf(id_doc: str):
98
  for title in titles:
99
  if title[0] == 1:
100
  main_titles.append(title[1])
101
- return {"message": main_titles, "pub_id": id_doc, "error": False} if len(main_titles) > 0 else {"message": f"No titles, document of {doc.page_count} pages", "pub_id": id_doc, "error": False}
102
  else:
103
  print("ID: " + id_doc)
104
  print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
105
  print("Status code: " + str(pdf_req.status_code))
106
- return {"error": True, "message": "Error while downloading PDF: " + str(pdf_req.status_code)}
107
 
108
  @app.get("/extract/random/{keyword}/{limit}")
109
  async def extract_random_pdf(keyword: str, limit: int):
 
89
 
90
  postprocess_text = remove_in_betweens(pdf_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
+ postprocess_text = re.sub(r"\ +", " ", postprocess_text)
93
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
94
  titles = doc.get_toc()
95
  main_titles = []
 
99
  for title in titles:
100
  if title[0] == 1:
101
  main_titles.append(title[1])
102
+ return {"pub_id": id_doc, "titles": main_titles, "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": id_doc, "titles": "No titles found !", "text": postprocess_text, "error": False}
103
  else:
104
  print("ID: " + id_doc)
105
  print("URL: " + f"http://arxiv.org/pdf/{id_doc}")
106
  print("Status code: " + str(pdf_req.status_code))
107
+ return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
108
 
109
  @app.get("/extract/random/{keyword}/{limit}")
110
  async def extract_random_pdf(keyword: str, limit: int):