Spaces:
Running
Running
File size: 1,638 Bytes
ec96972 eb87b3b ec96972 eb87b3b ec96972 eb87b3b ec96972 eb87b3b ec96972 eb87b3b ec96972 eb87b3b ec96972 eb87b3b ec96972 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import fitz # PyMuPDF
import requests
from io import BytesIO
import time
def parse_pdf_from_url(url, timeout=30):
    """Download a PDF from *url* and return the text of its non-blank pages.

    Progress and timing information is printed to stdout.

    Args:
        url: Address of the PDF document to fetch with ``requests.get``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the call forever.

    Returns:
        A list of strings, one per page whose extracted text is not
        whitespace-only, in page order.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an error status.
        Any exception raised by ``fitz.open`` or ``page.get_text`` is
        propagated unchanged.
    """
    start_time = time.time()
    print("Starting PDF download and parsing from URL...")

    download_start = time.time()
    res = requests.get(url, timeout=timeout)
    # Fail fast on 4xx/5xx instead of feeding an error page to the PDF parser.
    res.raise_for_status()
    download_time = time.time() - download_start
    print(f"PDF Download took: {download_time:.2f} seconds")

    parse_start = time.time()
    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        page_texts = (page.get_text() for page in doc)
        chunks = [text for text in page_texts if text.strip()]
    parse_time = time.time() - parse_start
    print(f"PDF Text Extraction took: {parse_time:.2f} seconds")

    total_time = time.time() - start_time
    print(f"Total PDF parsing from URL took: {total_time:.2f} seconds")
    return chunks
def parse_pdf_from_file(file_path):
    """Parse a local PDF file and return the text of its non-blank pages.

    Progress and timing information is printed to stdout.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of strings, one per page whose extracted text is not
        whitespace-only, in page order.

    Raises:
        Exception: If opening the file or extracting text fails. The
            message names ``file_path`` and the original exception is
            attached as ``__cause__``.
    """
    start_time = time.time()
    print(f"Starting PDF parsing from local file: {file_path}")
    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(file_path) as doc:
            page_texts = (page.get_text() for page in doc)
            chunks = [text for text in page_texts if text.strip()]
    except Exception as e:
        total_time = time.time() - start_time
        print(f"Error parsing PDF file after {total_time:.2f} seconds: {str(e)}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e
    else:
        total_time = time.time() - start_time
        print(f"Total PDF parsing from file took: {total_time:.2f} seconds")
        return chunks
|