Omar ID EL MOUMEN committed on
Commit
a13fabc
·
1 Parent(s): 0bf43b3

Add postprocessed text

Browse files
Files changed (1) hide show
  1. app.py +14 -9
app.py CHANGED
@@ -13,6 +13,13 @@ import re,os
13
  from io import BytesIO
14
  from datetime import datetime
15
 
 
 
 
 
 
 
 
16
 
17
  def receive_signal(signalNumber, frame):
18
  print('Received:', signalNumber)
@@ -95,14 +102,6 @@ async def extract_arxiv_pdf(document: DocumentID):
95
  if ref_pos is not None:
96
  pdf_text = pdf_text[:ref_pos - 10]
97
 
98
- def remove_in_betweens(text):
99
- removed_brackets = re.sub(r'\[.*?\]', ' ', text)
100
- removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
101
- return removed_parentheses
102
-
103
- def remove_punctuations(text):
104
- return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
105
-
106
  postprocess_text = remove_in_betweens(pdf_text)
107
  postprocess_text = remove_punctuations(postprocess_text)
108
  postprocess_text = re.sub(r"\s+", " ", postprocess_text)
@@ -137,7 +136,13 @@ async def extract_pdf(pdf: WebPDF):
137
  doc = fitz.open(stream=pdf_data, filetype="pdf")
138
  pdf_text = " ".join([page.get_text("text") for page in doc])
139
  pdf_metadata = doc.metadata
140
- return {"error": False, "title": pdf_metadata.get("title", "").strip(), "text": pdf_text}
 
 
 
 
 
 
141
  else:
142
  print("URL: " + pdf.url)
143
  print("Status code: " + str(pdf_req.status_code))
 
13
  from io import BytesIO
14
  from datetime import datetime
15
 
16
+ def remove_in_betweens(text):
17
+ removed_brackets = re.sub(r'\[.*?\]', ' ', text)
18
+ removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
19
+ return removed_parentheses
20
+
21
+ def remove_punctuations(text):
22
+ return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
23
 
24
  def receive_signal(signalNumber, frame):
25
  print('Received:', signalNumber)
 
102
  if ref_pos is not None:
103
  pdf_text = pdf_text[:ref_pos - 10]
104
 
 
 
 
 
 
 
 
 
105
  postprocess_text = remove_in_betweens(pdf_text)
106
  postprocess_text = remove_punctuations(postprocess_text)
107
  postprocess_text = re.sub(r"\s+", " ", postprocess_text)
 
136
  doc = fitz.open(stream=pdf_data, filetype="pdf")
137
  pdf_text = " ".join([page.get_text("text") for page in doc])
138
  pdf_metadata = doc.metadata
139
+ print(pdf_metadata)
140
+
141
+ postprocess_text = remove_in_betweens(pdf_text)
142
+ postprocess_text = remove_punctuations(postprocess_text)
143
+ postprocess_text = re.sub(r"\s+", " ", postprocess_text)
144
+ postprocess_text = postprocess_text.strip()
145
+ return {"error": False, "title": pdf_metadata.get("title", "").strip(), "text": postprocess_text}
146
  else:
147
  print("URL: " + pdf.url)
148
  print("Status code: " + str(pdf_req.status_code))