Spaces:
Running
Running
Omar ID EL MOUMEN
committed on
Commit
·
5e9984e
1
Parent(s):
a5f46a9
Update formatting method
Browse files
app.py
CHANGED
@@ -89,7 +89,8 @@ async def extract_text_pdf(id_doc: str):
|
|
89 |
|
90 |
postprocess_text = remove_in_betweens(pdf_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
-
postprocess_text = re.sub(r"\
|
|
|
93 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
94 |
titles = doc.get_toc()
|
95 |
main_titles = []
|
|
|
89 |
|
90 |
postprocess_text = remove_in_betweens(pdf_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
+
postprocess_text = re.sub(r"\s+", " ", postprocess_text)
|
93 |
+
postprocess_text = postprocess_text.strip()
|
94 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
95 |
titles = doc.get_toc()
|
96 |
main_titles = []
|