File size: 781 Bytes
ec96972
 
 
eb87b3b
ec96972
 
 
 
 
 
 
 
 
eb87b3b
ec96972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import fitz  # PyMuPDF
import requests
from io import BytesIO
import time

def parse_pdf_from_url(url, timeout=30):
    """Download a PDF from ``url`` and return the text of its non-blank pages.

    Args:
        url: Address of the PDF, fetched with ``requests.get``.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without one, a server that stops responding would block the
            call indefinitely.

    Returns:
        A list with one string per page, in iteration order, skipping pages
        whose extracted text is empty or whitespace-only.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.

    Errors raised while opening or reading the PDF propagate unchanged.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the PDF parser.
    res.raise_for_status()
    doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
    try:
        page_texts = (page.get_text() for page in doc)
        return [text for text in page_texts if text.strip()]
    finally:
        # Close the document even if text extraction fails part-way.
        doc.close()

def parse_pdf_from_file(file_path):
    """Parse a local PDF file and extract text chunks.

    Args:
        file_path: Path of the PDF, passed directly to ``fitz.open``.

    Returns:
        A list with one string per page, in iteration order, skipping pages
        whose extracted text is empty or whitespace-only.

    Raises:
        Exception: If opening the file or extracting text fails. The message
            names ``file_path`` and the original error is chained as the
            cause.
    """
    try:
        doc = fitz.open(file_path)
        try:
            page_texts = (page.get_text() for page in doc)
            return [text for text in page_texts if text.strip()]
        finally:
            # Close the document even if text extraction fails part-way.
            doc.close()
    except Exception as e:
        raise Exception(f"Error parsing PDF file {file_path}: {e}") from e