|
import os |
|
from dataclasses import dataclass, field |
|
from io import BytesIO |
|
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union |
|
|
|
import pyarrow as pa |
|
|
|
from .. import config |
|
from ..download.download_config import DownloadConfig |
|
from ..table import array_cast |
|
from ..utils.file_utils import is_local_path, xopen |
|
from ..utils.py_utils import string_to_dict |
|
|
|
|
|
if TYPE_CHECKING: |
|
import pdfplumber |
|
|
|
from .features import FeatureType |
|
|
|
|
|
def pdf_to_bytes(pdf: "pdfplumber.pdf.PDF") -> bytes:
    """Convert a pdfplumber.pdf.PDF object to bytes.

    The bytes are read from the stream the PDF object was opened on
    (`pdf.stream`). The stream is read from its beginning and, when it is
    seekable, its position is restored afterwards so that the PDF object can
    keep using it.

    Args:
        pdf (`pdfplumber.pdf.PDF`):
            PDF object exposing a binary file-like `stream` attribute.

    Returns:
        `bytes`: The content read from the underlying stream.
    """
    stream = pdf.stream
    try:
        position = stream.tell()
        stream.seek(0)
    except (AttributeError, OSError):
        # Non-seekable stream (io.UnsupportedOperation is an OSError): the best
        # we can do is return whatever is still available to read.
        return stream.read()
    try:
        # Read the whole document once. Writing the stream object itself into a
        # buffer once per page (as done previously) fails, since a file-like
        # object is not bytes-like.
        return stream.read()
    finally:
        # Leave the stream where we found it.
        stream.seek(position)
|
|
|
|
|
@dataclass
class Pdf:
    """
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `bytes` or `bytearray`: Raw bytes of the pdf file.
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.
        id (`str`, *optional*):
            Optional identifier stored on the feature. It is not read by any
            method of this class.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
     'path': 'path/to/pdf/file.pdf'}
    ```
    """

    decode: bool = True
    id: Optional[str] = None

    # Class-level constants and the serialized type tag (not constructor arguments).
    dtype: ClassVar[str] = "pdfplumber.pdf.PDF"
    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
    _type: str = field(default="Pdf", init=False, repr=False)

    def __call__(self):
        """Return the Arrow storage type of the feature."""
        return self.pa_type

    def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.pdf.PDF"]) -> dict:
        """Encode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `bytearray`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields

        Raises:
            TypeError: If `value` is none of the supported input types.
            ValueError: If `value` is a dict whose "path" and "bytes" are both missing or `None`.
        """
        # pdfplumber is optional: without it, PDF objects simply cannot be recognized.
        if config.PDFPLUMBER_AVAILABLE:
            import pdfplumber
        else:
            pdfplumber = None

        if isinstance(value, str):
            return {"path": value, "bytes": None}
        if isinstance(value, (bytes, bytearray)):
            return {"path": None, "bytes": value}
        if pdfplumber is not None and isinstance(value, pdfplumber.pdf.PDF):
            # convert the pdfplumber PDF object to a path or to bytes
            return encode_pdfplumber_pdf(value)
        if not hasattr(value, "get"):
            # Fail with an explicit message instead of an AttributeError on `.get`.
            raise TypeError(
                "A pdf sample should be a str, bytes, bytearray, pdfplumber.pdf.PDF or dict, "
                f"but got {type(value).__name__}."
            )

        path, bytes_ = value.get("path"), value.get("bytes")
        if path is not None and os.path.isfile(path):
            # The file exists locally: keep only the path and drop the bytes.
            return {"bytes": None, "path": path}
        if bytes_ is not None or path is not None:
            # Keep whatever is provided (bytes and/or a path that is not a local file).
            return {"bytes": bytes_, "path": path}
        raise ValueError(
            f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
        )

    def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
        """Decode example pdf file into pdf data.

        Args:
            value (`dict`):
                A dictionary with keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`

        Raises:
            RuntimeError: If decoding is disabled for this feature.
            ImportError: If `pdfplumber` is not available.
            ValueError: If both "path" and "bytes" are `None`.
        """
        if not self.decode:
            raise RuntimeError("Decoding is disabled for this feature. Please use Pdf(decode=True) instead.")

        if config.PDFPLUMBER_AVAILABLE:
            import pdfplumber
        else:
            raise ImportError("To support decoding pdfs, please install 'pdfplumber'.")

        if token_per_repo_id is None:
            token_per_repo_id = {}

        path, bytes_ = value["path"], value["bytes"]

        if bytes_ is not None:
            # Deliberately not wrapped in `with`: leaving a `with` block would run
            # the PDF's `__exit__` (closing it) before the caller ever receives it.
            return pdfplumber.open(BytesIO(bytes_))

        if path is None:
            raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")

        if is_local_path(path):
            return pdfplumber.open(path)

        # Remote file: work out which repository it belongs to so that the
        # matching token (if any) can be used for the download.
        source_url = path.split("::")[-1]
        pattern = (
            config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
        )
        try:
            repo_id = string_to_dict(source_url, pattern)["repo_id"]
            token = token_per_repo_id.get(repo_id)
        except ValueError:
            # No repo id could be extracted from the URL: proceed without a token.
            token = None
        download_config = DownloadConfig(token=token)
        # The handle is intentionally left open: it backs the returned PDF object.
        f = xopen(path, "rb", download_config=download_config)
        return pdfplumber.open(f)

    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
        from .features import Value

        if self.decode:
            return self
        return {
            "bytes": Value("binary"),
            "path": Value("string"),
        }

    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArray]) -> pa.StructArray:
        """Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that are explicitly converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the pdf bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

        Any other storage type is passed unchanged to `array_cast`.

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        """
        if pa.types.is_string(storage.type):
            # Paths only: pair them with an all-null "bytes" column.
            bytes_array = pa.array([None] * len(storage), type=pa.binary())
            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_binary(storage.type):
            # Bytes only: pair them with an all-null "path" column.
            path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
        elif pa.types.is_struct(storage.type):
            # Reuse the fields that exist and fill the missing one with nulls.
            if storage.type.get_field_index("bytes") >= 0:
                bytes_array = storage.field("bytes")
            else:
                bytes_array = pa.array([None] * len(storage), type=pa.binary())
            if storage.type.get_field_index("path") >= 0:
                path_array = storage.field("path")
            else:
                path_array = pa.array([None] * len(storage), type=pa.string())
            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
        return array_cast(storage, self.pa_type)
|
|
|
|
|
def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
    """
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF's stream exposes a non-empty string `name`, that name is returned
    as the path. Otherwise the PDF content is serialized into bytes.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" and "bytes" keys, exactly one of which is not `None`.
    """
    stream = getattr(pdf, "stream", None)
    name = getattr(stream, "name", None)
    # Only a non-empty string can be a path: file objects created from a file
    # descriptor report an integer `name`, which must not be stored as a path.
    if isinstance(name, str) and name:
        return {"path": name, "bytes": None}
    # No usable path (e.g. in-memory stream): fall back to the raw content.
    return {"path": None, "bytes": pdf_to_bytes(pdf)}
|
|