# NOTE: removed scraped page chrome (status text, commit hashes, line-number
# gutter) that was extraction residue, not part of this module's source.
import fitz # PyMuPDF
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import pytesseract
import imghdr
from bs4 import BeautifulSoup # pip install beautifulsoup4
def _extract_text(page):
text = page.get_text()
return text.strip() if text and text.strip() else None
def is_image(content):
    """
    Return True when *content* starts with the magic bytes of a supported
    raster format (JPEG, PNG, BMP, GIF, TIFF, or WebP).

    Implemented with explicit signature checks instead of ``imghdr``: that
    module was deprecated by PEP 594 and removed in Python 3.13, its JPEG
    test missed files lacking a JFIF/Exif marker, and its TIFF test (a bare
    ``MM``/``II`` prefix) produced false positives.

    Args:
        content: Raw file bytes (at least the first 12 bytes are examined).

    Returns:
        bool: True for a recognized image signature, False otherwise.
    """
    h = content[:12]
    return (
        h.startswith(b"\xff\xd8\xff")                      # JPEG (any variant)
        or h.startswith(b"\x89PNG\r\n\x1a\n")              # PNG
        or h.startswith(b"BM")                             # BMP
        or h[:6] in (b"GIF87a", b"GIF89a")                 # GIF
        or h[:4] in (b"II*\x00", b"MM\x00*")               # TIFF (little/big endian)
        or (h.startswith(b"RIFF") and h[8:12] == b"WEBP")  # WebP (RIFF container)
    )
def extract_text_from_image_bytes(image_bytes):
    """Decode raw image bytes with Pillow and return the Tesseract OCR text, stripped."""
    img = Image.open(BytesIO(image_bytes))
    ocr_text = pytesseract.image_to_string(img)
    return ocr_text.strip()
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """
    Download a document (PDF, image, or webpage) from ``url`` and extract text.

    Dispatches on the response content type: HTML is parsed with
    BeautifulSoup, images go through Tesseract OCR, and everything else is
    attempted as a PDF whose pages are text-extracted in parallel.

    Args:
        url: Location of the remote document.
        max_workers: Thread-pool size used for PDF page extraction.
        chunk_size: When > 1, join that many consecutive pages per list entry.

    Returns:
        A list of non-empty text strings, or a one-element fallback message
        list describing why no text could be extracted.
    """
    try:
        # timeout= keeps a dead or slow host from hanging the caller forever;
        # raise_for_status() avoids OCR-ing/parsing HTTP 4xx/5xx error pages.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        print(f"[error] Failed to download: {e}")
        return ["No data found in this document (download error)"]
    # Handle HTML webpages.
    if "text/html" in content_type or url.endswith(".html"):
        print("[info] Detected HTML page. Extracting text...")
        try:
            soup = BeautifulSoup(content, "html.parser")
            text = soup.get_text(separator="\n")
            lines = [t.strip() for t in text.splitlines() if t.strip()]
            return lines if lines else ["No data found in this document (empty HTML)"]
        except Exception as e:
            print(f"[error] HTML parse failed: {e}")
            return ["No data found in this document (HTML error)"]
    # Unsupported container/binary formats are rejected up front.
    if "zip" in content_type or url.endswith(".zip"):
        return ["No data found in this document (zip)"]
    if "octet-stream" in content_type or url.endswith(".bin"):
        return ["No data found in this document (bin)"]
    # OCR for image files (header says image, or magic bytes look like one).
    if "image" in content_type or is_image(content):
        print("[info] Detected image file. Using OCR...")
        try:
            text = extract_text_from_image_bytes(content)
            return [text] if text else ["No data found in this document (image empty)"]
        except Exception as e:
            print(f"[error] OCR failed: {e}")
            return ["No data found in this document (image/OCR error)"]
    # Fall back to PDF parsing with one thread per batch of pages.
    try:
        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
            pages = list(doc)
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
        if chunk_size > 1:
            joined = [
                ' '.join(t for t in texts[i:i + chunk_size] if t)
                for i in range(0, len(texts), chunk_size)
            ]
            chunks = [c for c in joined if c]
            return chunks if chunks else ["No data found in this document (empty PDF)"]
        return [t for t in texts if t] or ["No data found in this document (empty PDF)"]
    except Exception as e:
        print(f"[error] Failed to parse as PDF: {e}")
        return ["No data found in this document (not PDF or corrupted)"]
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Open a local PDF, extract each page's text in parallel, and optionally
    merge runs of ``chunk_size`` consecutive pages into single entries.
    Returns a list of non-empty strings, or a one-element fallback message.
    """
    try:
        with fitz.open(file_path) as document:
            page_list = list(document)
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
                page_texts = list(pool.map(_extract_text, page_list))
            # No chunking requested: just drop the blank pages.
            if chunk_size <= 1:
                non_empty = [txt for txt in page_texts if txt]
                return non_empty or ["No data found in this document (local PDF empty)"]
            merged = []
            for start in range(0, len(page_texts), chunk_size):
                window = [txt for txt in page_texts[start:start + chunk_size] if txt]
                combined = ' '.join(window)
                if combined:
                    merged.append(combined)
            return merged if merged else ["No data found in this document (local PDF empty)"]
    except Exception as e:
        print(f"β Failed to open local file: {str(e)}")
        return [f"No data found in this document (local file error)"]