"""Extract per-page text from PDF documents using PyMuPDF and a thread pool."""

import os
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import fitz  # PyMuPDF
import requests


def extract_page_text(page):
    """Return the text of ``page``, or ``None`` when the page has no text.

    Args:
        page: An object exposing ``get_text()`` (here, a PyMuPDF page).

    Returns:
        The string returned by ``page.get_text()``, unmodified, or ``None``
        if that string is empty or contains only whitespace.
    """
    text = page.get_text()
    return text if text.strip() else None


def _resolve_max_workers(max_workers):
    """Return ``max_workers``, defaulting to the CPU count (8 if unknown)."""
    if max_workers is None:
        # Automatically detect and use all available CPU cores.
        return os.cpu_count() or 8
    return max_workers


def _extract_document_text(doc, max_workers):
    """Return the non-empty page texts of an open document, in page order.

    The caller owns ``doc`` and is responsible for closing it.
    """
    # Materialize the pages up front so every page object stays referenced
    # for as long as the worker threads are running.
    pages = list(doc)

    # NOTE(review): PyMuPDF's documentation states that it does not support
    # multithreaded use. Confirm that concurrent ``get_text()`` calls on pages
    # of one document are safe for the installed version, or call with
    # ``max_workers=1``.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # ``executor.map`` yields results in input order, so page order is
        # preserved regardless of which worker finishes first.
        results = list(executor.map(extract_page_text, pages))

    # Keep only non-empty page results, preserving order.
    return [text for text in results if text]


def parse_pdf_from_url_multithreaded(url, max_workers=None, timeout=30):
    """Download a PDF and return the text of each non-empty page.

    Args:
        url: Address of the PDF to download.
        max_workers: Size of the thread pool. Defaults to ``os.cpu_count()``,
            or 8 when the CPU count cannot be determined.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the caller forever. Pass ``None`` to wait
            indefinitely.

    Returns:
        A list of page texts in page order; pages without text are omitted.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    max_workers = _resolve_max_workers(max_workers)

    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the PDF parser.
    res.raise_for_status()

    doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
    try:
        return _extract_document_text(doc, max_workers)
    finally:
        # Close the document even when extraction raises.
        doc.close()


def parse_pdf_from_file_multithreaded(file_path, max_workers=None):
    """Read a PDF from disk and return the text of each non-empty page.

    Args:
        file_path: Path of the PDF file to open.
        max_workers: Size of the thread pool. Defaults to ``os.cpu_count()``,
            or 8 when the CPU count cannot be determined.

    Returns:
        A list of page texts in page order; pages without text are omitted.

    Raises:
        Exception: If opening or parsing the file fails. The message names
            ``file_path`` and the original error is attached as
            ``__cause__``.
    """
    max_workers = _resolve_max_workers(max_workers)

    try:
        doc = fitz.open(file_path)
        try:
            return _extract_document_text(doc, max_workers)
        finally:
            # Close the document even when extraction raises.
            doc.close()
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e