Spaces:
Running
Running
File size: 781 Bytes
ec96972 eb87b3b ec96972 eb87b3b ec96972 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import fitz # PyMuPDF
import requests
from io import BytesIO
import time
def parse_pdf_from_url(url, timeout=30):
    """Download a PDF from *url* and return the text of its non-blank pages.

    Args:
        url: Address of the PDF, fetched with ``requests.get``.
        timeout: Seconds to wait on the server before giving up; passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        A list of strings, one per page whose extracted text is not
        whitespace-only, in page order.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to the PDF parser.
    res.raise_for_status()
    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        page_texts = (page.get_text() for page in doc)
        # Materialize the list while the document is still open.
        return [text for text in page_texts if text.strip()]
def parse_pdf_from_file(file_path):
    """Parse a local PDF file and extract text chunks.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of strings, one per page whose extracted text is not
        whitespace-only, in page order.

    Raises:
        Exception: If opening or reading the document fails. The message
            names ``file_path`` and the underlying error, and the original
            exception is attached as ``__cause__``.
    """
    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(file_path) as doc:
            page_texts = (page.get_text() for page in doc)
            # Materialize the list while the document is still open.
            return [text for text in page_texts if text.strip()]
    except Exception as e:
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e
|