Rivalcoder committed · Commit 40c134d · 1 Parent(s): 71a01ff
[Edit] Update Access

Files changed: pdf_parser.py (+15 -2)
pdf_parser.py (CHANGED)
@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
 from PIL import Image
 import pytesseract
 import imghdr
+from bs4 import BeautifulSoup  # pip install beautifulsoup4
 
 def _extract_text(page):
     text = page.get_text()
@@ -19,7 +20,7 @@ def extract_text_from_image_bytes(image_bytes):
 
 def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
     """
-    Download document (PDF or Image) from URL, extract text accordingly.
+    Download document (PDF, Image, or Webpage) from URL, extract text accordingly.
     Gracefully return fallback message if unsupported or failed.
     """
     try:
@@ -30,6 +31,18 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
         print(f"❌ Failed to download: {str(e)}")
         return [f"No data found in this document (download error)"]
 
+    # Handle HTML webpages
+    if "text/html" in content_type or url.endswith(".html"):
+        print("🌐 Detected HTML page. Extracting text...")
+        try:
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text(separator="\n")
+            lines = [t.strip() for t in text.splitlines() if t.strip()]
+            return lines if lines else ["No data found in this document (empty HTML)"]
+        except Exception as e:
+            print(f"❌ HTML parse failed: {str(e)}")
+            return [f"No data found in this document (HTML error)"]
+
     # Check for unsupported content
     if "zip" in content_type or url.endswith(".zip"):
         return ["No data found in this document (zip)"]
@@ -46,7 +59,7 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
         print(f"❌ OCR failed: {str(e)}")
         return [f"No data found in this document (image/OCR error)"]
 
-    # Try PDF
+    # Try PDF parsing
     try:
         with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
             pages = list(doc)
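
For context, below is a minimal standalone sketch of the HTML path this commit adds. It assumes the surrounding function (not shown in these hunks) downloads the URL with requests and derives content_type from the response headers; the helper name and the example URL are illustrative only, not part of the repository.

import requests
from bs4 import BeautifulSoup  # pip install beautifulsoup4

def extract_lines_from_html_url(url):
    # Hypothetical helper mirroring the new HTML branch; not part of the repo.
    resp = requests.get(url, timeout=30)
    content_type = resp.headers.get("Content-Type", "").lower()
    if "text/html" in content_type or url.endswith(".html"):
        # Same extraction as the diff: flatten the page to text, keep non-empty lines.
        soup = BeautifulSoup(resp.content, "html.parser")
        text = soup.get_text(separator="\n")
        lines = [t.strip() for t in text.splitlines() if t.strip()]
        return lines if lines else ["No data found in this document (empty HTML)"]
    return ["No data found in this document (unsupported type)"]

# Example call (placeholder URL):
# print(extract_lines_from_html_url("https://example.com/")[:5])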