Omar ID EL MOUMEN committed on
Commit
5e9984e
·
1 Parent(s): a5f46a9

Update formatting method

Browse files
Files changed (1) hide show
  1. app.py +2 -1
app.py CHANGED
@@ -89,7 +89,8 @@ async def extract_text_pdf(id_doc: str):
89
 
90
  postprocess_text = remove_in_betweens(pdf_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
- postprocess_text = re.sub(r"\ +", " ", postprocess_text)
 
93
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
94
  titles = doc.get_toc()
95
  main_titles = []
 
89
 
90
  postprocess_text = remove_in_betweens(pdf_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
+ postprocess_text = re.sub(r"\s+", " ", postprocess_text)
93
+ postprocess_text = postprocess_text.strip()
94
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
95
  titles = doc.get_toc()
96
  main_titles = []