Spaces:
Running
Running
import fitz # PyMuPDF | |
import requests | |
from io import BytesIO | |
from concurrent.futures import ThreadPoolExecutor | |
import os | |
def extract_page_text(page):
    """Return the text of *page*, or ``None`` when it has no visible text.

    Args:
        page: An object exposing ``get_text()`` that returns a string
            (the callers in this file pass pages of a ``fitz`` document).

    Returns:
        The text exactly as returned by ``page.get_text()`` (not stripped),
        or ``None`` if that text is empty or whitespace-only.
    """
    text = page.get_text()
    # strip() is used only for the emptiness test; the returned text keeps
    # its original leading/trailing whitespace.
    if not text.strip():
        return None
    return text
def parse_pdf_from_url_multithreaded(url, max_workers=None, *, timeout=30):
    """Download a PDF from *url* and return the text of its non-empty pages.

    Args:
        url: Address of the PDF; fetched with ``requests.get``.
        max_workers: Thread-pool size. Defaults to ``os.cpu_count()``, or 8
            when the CPU count cannot be determined.
        timeout: Timeout in seconds handed to ``requests.get`` so that an
            unresponsive server cannot block the call forever.

    Returns:
        A list of page texts in page order; pages for which
        ``extract_page_text`` returns an empty value are omitted.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        Exception: Errors raised while opening or reading the PDF propagate
            unchanged.
    """
    if max_workers is None:
        max_workers = os.cpu_count() or 8

    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of feeding an error page to the PDF parser.
    response.raise_for_status()

    doc = fitz.open(stream=BytesIO(response.content), filetype="pdf")
    try:
        pages = list(doc)
        # NOTE(review): PyMuPDF's documentation warns that it does not
        # support use from multiple threads -- confirm this is safe for the
        # installed version, or fall back to sequential extraction.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # executor.map yields results in input order, so page order is
            # preserved.
            results = list(executor.map(extract_page_text, pages))
    finally:
        # Release the document even when extraction raises.
        doc.close()

    return [text for text in results if text]
def parse_pdf_from_file_multithreaded(file_path, max_workers=None):
    """Open the PDF at *file_path* and return the text of its non-empty pages.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        max_workers: Thread-pool size. Defaults to ``os.cpu_count()``, or 8
            when the CPU count cannot be determined.

    Returns:
        A list of page texts in page order; pages for which
        ``extract_page_text`` returns an empty value are omitted.

    Raises:
        Exception: Any failure while opening or reading the file, re-raised
            with the file path in the message and the original error
            attached as ``__cause__``.
    """
    if max_workers is None:
        max_workers = os.cpu_count() or 8

    try:
        doc = fitz.open(file_path)
        try:
            pages = list(doc)
            # NOTE(review): PyMuPDF's documentation warns that it does not
            # support use from multiple threads -- confirm this is safe for
            # the installed version, or fall back to sequential extraction.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # executor.map yields results in input order, so page order
                # is preserved.
                results = list(executor.map(extract_page_text, pages))
        finally:
            # Release the document even when extraction raises.
            doc.close()
    except Exception as e:
        raise Exception(f"Error parsing PDF file {file_path}: {e}") from e

    return [text for text in results if text]