import time
from io import BytesIO
from typing import List

import fitz  # PyMuPDF
import requests


def _extract_page_texts(doc) -> List[str]:
    """Return the text of each page in *doc* that has non-whitespace text.

    Pages whose extracted text is empty or whitespace-only are skipped, so
    the result may contain fewer entries than the document has pages.
    """
    page_texts = (page.get_text() for page in doc)
    return [text for text in page_texts if text.strip()]


def parse_pdf_from_url(url: str, *, timeout=60) -> List[str]:
    """Download the PDF at *url* and extract its text, one chunk per page.

    Progress and timing information is printed to stdout.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            (a number or a ``(connect, read)`` tuple). Without one, a
            stalled server would block the caller forever.

    Returns:
        The text of every page that contains non-whitespace text, in page
        order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    start_time = time.perf_counter()
    print("Starting PDF download and parsing from URL...")

    download_start = time.perf_counter()
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to the PDF parser,
    # which would surface as a confusing "cannot open document" error.
    res.raise_for_status()
    download_time = time.perf_counter() - download_start
    print(f"PDF Download took: {download_time:.2f} seconds")

    parse_start = time.perf_counter()
    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        chunks = _extract_page_texts(doc)
    parse_time = time.perf_counter() - parse_start
    print(f"PDF Text Extraction took: {parse_time:.2f} seconds")

    total_time = time.perf_counter() - start_time
    print(f"Total PDF parsing from URL took: {total_time:.2f} seconds")

    return chunks


def parse_pdf_from_file(file_path) -> List[str]:
    """Parse a local PDF file and extract text chunks, one per page.

    Progress and timing information is printed to stdout.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of every page that contains non-whitespace text, in page
        order.

    Raises:
        Exception: If the file cannot be opened or parsed. The message names
            the file, and the original error is attached as ``__cause__``.
    """
    start_time = time.perf_counter()
    print(f"Starting PDF parsing from local file: {file_path}")

    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(file_path) as doc:
            chunks = _extract_page_texts(doc)

        total_time = time.perf_counter() - start_time
        print(f"Total PDF parsing from file took: {total_time:.2f} seconds")

        return chunks
    except Exception as e:
        total_time = time.perf_counter() - start_time
        print(f"Error parsing PDF file after {total_time:.2f} seconds: {str(e)}")
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e