File size: 1,638 Bytes
ec96972
 
 
eb87b3b
ec96972
 
eb87b3b
 
 
 
ec96972
eb87b3b
 
 
 
ec96972
 
 
 
 
 
eb87b3b
 
 
 
 
 
ec96972
 
 
 
eb87b3b
 
 
ec96972
 
 
 
 
 
 
 
eb87b3b
 
 
ec96972
 
eb87b3b
 
ec96972
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import fitz  # PyMuPDF
import requests
from io import BytesIO
import time

def parse_pdf_from_url(url, timeout=30):
    """Download a PDF over HTTP and return the text of each non-empty page.

    Timing information for the download, the text extraction and the whole
    call is printed to stdout.

    Args:
        url: Address of the PDF, fetched with an HTTP GET.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Pass
            ``None`` to wait indefinitely (the behaviour before this
            parameter existed).

    Returns:
        A list of strings, one per page in document order, holding the
        result of ``page.get_text()``. Pages whose text is empty or
        whitespace-only are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    start_time = time.time()
    print("Starting PDF download and parsing from URL...")

    download_start = time.time()
    # Without a timeout a stalled server would block this call forever.
    res = requests.get(url, timeout=timeout)
    download_time = time.time() - download_start
    print(f"PDF Download took: {download_time:.2f} seconds")

    # Fail early on HTTP errors instead of feeding an error page to the
    # PDF parser, which would only produce a confusing parse failure.
    res.raise_for_status()

    parse_start = time.time()
    doc = fitz.open(stream=BytesIO(res.content), filetype="pdf")
    try:
        page_texts = (page.get_text() for page in doc)
        chunks = [text for text in page_texts if text.strip()]
    finally:
        # Release the document even when extraction raises part-way through.
        doc.close()
    parse_time = time.time() - parse_start
    print(f"PDF Text Extraction took: {parse_time:.2f} seconds")

    total_time = time.time() - start_time
    print(f"Total PDF parsing from URL took: {total_time:.2f} seconds")
    return chunks

def parse_pdf_from_file(file_path):
    """Parse a local PDF file and return the text of each non-empty page.

    Progress and timing information is printed to stdout.

    Args:
        file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of strings, one per page in document order, holding the
        result of ``page.get_text()``. Pages whose text is empty or
        whitespace-only are skipped.

    Raises:
        Exception: If opening the file or extracting text fails. The
            message names ``file_path`` and the underlying error, which is
            also attached as ``__cause__``.
    """
    start_time = time.time()
    print(f"Starting PDF parsing from local file: {file_path}")

    try:
        doc = fitz.open(file_path)
        try:
            page_texts = (page.get_text() for page in doc)
            chunks = [text for text in page_texts if text.strip()]
        finally:
            # Release the document even when extraction raises part-way.
            doc.close()
    except Exception as e:
        total_time = time.time() - start_time
        print(f"Error parsing PDF file after {total_time:.2f} seconds: {str(e)}")
        # Plain Exception is kept so existing callers see the same type;
        # "from e" preserves the original traceback as the explicit cause.
        raise Exception(f"Error parsing PDF file {file_path}: {str(e)}") from e

    total_time = time.time() - start_time
    print(f"Total PDF parsing from file took: {total_time:.2f} seconds")
    return chunks