Spaces:

Rivalcoder
/

Issurance_Agent_Rag

Running

App Files Files Community

Rivalcoder committed 2 days ago

Commit

40c134d

1 Parent(s): 71a01ff

[Edit] Update Access

Browse files

Files changed (1) hide show

pdf_parser.py +15 -2

pdf_parser.py CHANGED Viewed

@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
 from PIL import Image
 import pytesseract
 import imghdr
 def _extract_text(page):
     text = page.get_text()
@@ -19,7 +20,7 @@ def extract_text_from_image_bytes(image_bytes):
 def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
     """
-    Download document (PDF or Image) from URL, extract text accordingly.
     Gracefully return fallback message if unsupported or failed.
     """
     try:
@@ -30,6 +31,18 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
         print(f"❌ Failed to download: {str(e)}")
         return [f"No data found in this document (download error)"]
     # Check for unsupported content
     if "zip" in content_type or url.endswith(".zip"):
         return ["No data found in this document (zip)"]
@@ -46,7 +59,7 @@ def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
             print(f"❌ OCR failed: {str(e)}")
             return [f"No data found in this document (image/OCR error)"]
-    # Try PDF fallback
     try:
         with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
             pages = list(doc)

 from PIL import Image
 import pytesseract
 import imghdr
+from bs4 import BeautifulSoup  # pip install beautifulsoup4
 def _extract_text(page):
     text = page.get_text()
 def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
     """
+    Download document (PDF, Image, or Webpage) from URL, extract text accordingly.
     Gracefully return fallback message if unsupported or failed.
     """
     try:
         print(f"❌ Failed to download: {str(e)}")
         return [f"No data found in this document (download error)"]
+    # Handle HTML webpages
+    if "text/html" in content_type or url.endswith(".html"):
+        print("🌐 Detected HTML page. Extracting text...")
+        try:
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text(separator="\n")
+            lines = [t.strip() for t in text.splitlines() if t.strip()]
+            return lines if lines else ["No data found in this document (empty HTML)"]
+        except Exception as e:
+            print(f"❌ HTML parse failed: {str(e)}")
+            return [f"No data found in this document (HTML error)"]
     # Check for unsupported content
     if "zip" in content_type or url.endswith(".zip"):
         return ["No data found in this document (zip)"]
             print(f"❌ OCR failed: {str(e)}")
             return [f"No data found in this document (image/OCR error)"]
+    # Try PDF parsing
     try:
         with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
             pages = list(doc)