Spaces:
Running
Running
Omar ID EL MOUMEN
committed on
Commit
·
a13fabc
1
Parent(s):
0bf43b3
Add postprocessed text
Browse files
app.py
CHANGED
@@ -13,6 +13,13 @@ import re,os
|
|
13 |
from io import BytesIO
|
14 |
from datetime import datetime
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def receive_signal(signalNumber, frame):
|
18 |
print('Received:', signalNumber)
|
@@ -95,14 +102,6 @@ async def extract_arxiv_pdf(document: DocumentID):
|
|
95 |
if ref_pos is not None:
|
96 |
pdf_text = pdf_text[:ref_pos - 10]
|
97 |
|
98 |
-
def remove_in_betweens(text):
|
99 |
-
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
100 |
-
removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
|
101 |
-
return removed_parentheses
|
102 |
-
|
103 |
-
def remove_punctuations(text):
|
104 |
-
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
|
105 |
-
|
106 |
postprocess_text = remove_in_betweens(pdf_text)
|
107 |
postprocess_text = remove_punctuations(postprocess_text)
|
108 |
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
@@ -137,7 +136,13 @@ async def extract_pdf(pdf: WebPDF):
|
|
137 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
138 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
139 |
pdf_metadata = doc.metadata
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
else:
|
142 |
print("URL: " + pdf.url)
|
143 |
print("Status code: " + str(pdf_req.status_code))
|
|
|
13 |
from io import BytesIO
|
14 |
from datetime import datetime
|
15 |
|
16 |
+
def remove_in_betweens(text):
|
17 |
+
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
18 |
+
removed_parentheses = re.sub(r'\(.*?\)', ' ', removed_brackets)
|
19 |
+
return removed_parentheses
|
20 |
+
|
21 |
+
def remove_punctuations(text):
|
22 |
+
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
|
23 |
|
24 |
def receive_signal(signalNumber, frame):
|
25 |
print('Received:', signalNumber)
|
|
|
102 |
if ref_pos is not None:
|
103 |
pdf_text = pdf_text[:ref_pos - 10]
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
postprocess_text = remove_in_betweens(pdf_text)
|
106 |
postprocess_text = remove_punctuations(postprocess_text)
|
107 |
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
|
|
136 |
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
137 |
pdf_text = " ".join([page.get_text("text") for page in doc])
|
138 |
pdf_metadata = doc.metadata
|
139 |
+
print(pdf_metadata)
|
140 |
+
|
141 |
+
postprocess_text = remove_in_betweens(pdf_text)
|
142 |
+
postprocess_text = remove_punctuations(postprocess_text)
|
143 |
+
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
144 |
+
postprocess_text = postprocess_text.strip()
|
145 |
+
return {"error": False, "title": pdf_metadata.get("title", "").strip(), "text": postprocess_text}
|
146 |
else:
|
147 |
print("URL: " + pdf.url)
|
148 |
print("Status code: " + str(pdf_req.status_code))
|