Rivalcoder committed on
Commit
40c134d
·
1 Parent(s): 71a01ff

[Edit] Update Access

Browse files
Files changed (1) hide show
  1. pdf_parser.py +15 -2
pdf_parser.py CHANGED
@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
5
  from PIL import Image
6
  import pytesseract
7
  import imghdr
 
8
 
9
  def _extract_text(page):
10
  text = page.get_text()
@@ -19,7 +20,7 @@ def extract_text_from_image_bytes(image_bytes):
19
 
20
  def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
21
  """
22
- Download document (PDF or Image) from URL, extract text accordingly.
23
  Gracefully return fallback message if unsupported or failed.
24
  """
25
  try:
@@ -30,6 +31,18 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
30
  print(f"❌ Failed to download: {str(e)}")
31
  return [f"No data found in this document (download error)"]
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Check for unsupported content
34
  if "zip" in content_type or url.endswith(".zip"):
35
  return ["No data found in this document (zip)"]
@@ -46,7 +59,7 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
46
  print(f"❌ OCR failed: {str(e)}")
47
  return [f"No data found in this document (image/OCR error)"]
48
 
49
- # Try PDF fallback
50
  try:
51
  with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
52
  pages = list(doc)
 
5
  from PIL import Image
6
  import pytesseract
7
  import imghdr
8
+ from bs4 import BeautifulSoup # pip install beautifulsoup4
9
 
10
  def _extract_text(page):
11
  text = page.get_text()
 
20
 
21
  def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
22
  """
23
+ Download document (PDF, Image, or Webpage) from URL, extract text accordingly.
24
  Gracefully return fallback message if unsupported or failed.
25
  """
26
  try:
 
31
  print(f"❌ Failed to download: {str(e)}")
32
  return [f"No data found in this document (download error)"]
33
 
34
+ # Handle HTML webpages
35
+ if "text/html" in content_type or url.endswith(".html"):
36
+ print("🌐 Detected HTML page. Extracting text...")
37
+ try:
38
+ soup = BeautifulSoup(content, "html.parser")
39
+ text = soup.get_text(separator="\n")
40
+ lines = [t.strip() for t in text.splitlines() if t.strip()]
41
+ return lines if lines else ["No data found in this document (empty HTML)"]
42
+ except Exception as e:
43
+ print(f"❌ HTML parse failed: {str(e)}")
44
+ return [f"No data found in this document (HTML error)"]
45
+
46
  # Check for unsupported content
47
  if "zip" in content_type or url.endswith(".zip"):
48
  return ["No data found in this document (zip)"]
 
59
  print(f"❌ OCR failed: {str(e)}")
60
  return [f"No data found in this document (image/OCR error)"]
61
 
62
+ # Try PDF parsing
63
  try:
64
  with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
65
  pages = list(doc)