Spaces:
Running
Running
import fitz # PyMuPDF | |
import requests | |
from io import BytesIO | |
import time | |
def parse_pdf_from_url(url, timeout=30):
    """Download a PDF from *url* and return the text of each non-empty page.

    Timing information for the download, the text extraction, and the whole
    operation is printed to stdout.

    Args:
        url: Address of the PDF document to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.
            Pass ``None`` to wait indefinitely.

    Returns:
        A list of strings, one per page whose extracted text contains
        something other than whitespace, in page order.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # perf_counter is monotonic, so measured durations cannot go negative
    # when the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    print("Starting PDF download and parsing from URL...")

    download_start = time.perf_counter()
    res = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an HTML error page to the PDF
    # parser, which would surface as a confusing "not a PDF" style error.
    res.raise_for_status()
    download_time = time.perf_counter() - download_start
    print(f"PDF Download took: {download_time:.2f} seconds")

    parse_start = time.perf_counter()
    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        page_texts = (page.get_text() for page in doc)
        # Must be materialized inside the ``with`` block: pages are not
        # readable once the document has been closed.
        chunks = [text for text in page_texts if text.strip()]
    parse_time = time.perf_counter() - parse_start
    print(f"PDF Text Extraction took: {parse_time:.2f} seconds")

    total_time = time.perf_counter() - start_time
    print(f"Total PDF parsing from URL took: {total_time:.2f} seconds")
    return chunks
def parse_pdf_from_file(file_path):
    """Parse a local PDF file and return the text of each non-empty page.

    The elapsed time is printed to stdout on both success and failure.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A list of strings, one per page whose extracted text contains
        something other than whitespace, in page order.

    Raises:
        Exception: If the file cannot be opened or its text cannot be
            extracted. The original error is attached as ``__cause__``.
    """
    # perf_counter is monotonic, so the reported duration is not affected by
    # system clock adjustments.
    start_time = time.perf_counter()
    print(f"Starting PDF parsing from local file: {file_path}")
    try:
        # The context manager closes the document even when extraction
        # fails part-way through the pages.
        with fitz.open(file_path) as doc:
            page_texts = (page.get_text() for page in doc)
            # Must be materialized inside the ``with`` block: pages are not
            # readable once the document has been closed.
            chunks = [text for text in page_texts if text.strip()]
    except Exception as e:
        total_time = time.perf_counter() - start_time
        print(f"Error parsing PDF file after {total_time:.2f} seconds: {str(e)}")
        # Keep raising a plain Exception for existing callers, but chain the
        # underlying error explicitly so the traceback shows the real cause.
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e

    total_time = time.perf_counter() - start_time
    print(f"Total PDF parsing from file took: {total_time:.2f} seconds")
    return chunks