|
|
|
|
|
from typing import Any, Optional, Union |
|
|
|
from huggingface_hub import HfFileSystem |
|
|
|
from . import config |
|
from .table import CastError |
|
from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str |
|
|
|
|
|
class DatasetsError(Exception):
    """Common base class for the exceptions raised by this library.

    Catching it also catches the more specific errors that derive from it.
    """
|
|
|
|
|
class DefunctDatasetError(DatasetsError):
    """The dataset is defunct."""
|
|
|
|
|
class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """A ``FileNotFoundError`` raised by this library.

    It inherits from both ``DatasetsError`` and the builtin
    ``FileNotFoundError``, so an ``except`` clause for either one catches it.
    """
|
|
|
|
|
class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """No data files, or no supported data files, were found."""
|
|
|
|
|
class DatasetNotFoundError(FileNotFoundDatasetsError):
    """The dataset could not be found.

    Raised on an attempt to access either:

    - a dataset that is missing, or
    - a private/gated dataset while the user is not authenticated.
    """
|
|
|
|
|
class DatasetBuildError(DatasetsError):
    """Base class for the dataset build errors defined in this module.

    ``ManualDownloadError``, ``FileFormatError`` and ``DatasetGenerationError``
    derive from it.
    """
|
|
|
|
|
class ManualDownloadError(DatasetBuildError):
    """Dataset build error related to a manual download.

    NOTE(review): the raise sites are outside this module; confirm the exact
    conditions there.
    """
|
|
|
|
|
class FileFormatError(DatasetBuildError):
    """Dataset build error related to the format of a file.

    NOTE(review): the raise sites are outside this module; confirm the exact
    conditions there.
    """
|
|
|
|
|
class DatasetGenerationError(DatasetBuildError):
    """Dataset build error for a failure while generating the dataset.

    ``DatasetGenerationCastError`` derives from it.
    """
|
|
|
|
|
class DatasetGenerationCastError(DatasetGenerationError):
    """Dataset generation error built from a ``CastError``.

    Use :meth:`from_cast_error` to create an instance with a user-facing
    message.
    """

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """Create the error from a ``CastError`` and the generation kwargs.

        The message is made of a fixed headline, the text returned by
        ``cast_error.details()``, the tracked generation kwargs (if any), and a
        fixed help paragraph.

        Args:
            cast_error: Error whose ``details()`` text is embedded in the message.
            builder_name: Name of the dataset builder, quoted in the message.
            gen_kwargs: Generation kwargs. Only values that are instances of
                ``tracked_str``, ``tracked_list`` or
                ``TrackedIterableFromGenerator`` are reported; all others are
                ignored.
            token: Forwarded to ``HfFileSystem`` when an ``hf://`` path has to
                be resolved.

        Returns:
            A new instance of ``cls`` whose only argument is the assembled message.
        """
        container_types = (tracked_list, TrackedIterableFromGenerator)
        reportable_types = (tracked_str, *container_types)

        # `details()` is evaluated up front, before any gen_kwarg is inspected.
        message_parts = [
            "An error occurred while generating the dataset",
            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}",
        ]

        sources: list[str] = []
        for value in gen_kwargs.values():
            if not isinstance(value, reportable_types):
                continue
            # Follow `last_item` through nested tracked containers, stopping at
            # a non-container or at a container whose `last_item` is None.
            while isinstance(value, container_types) and value.last_item is not None:
                value = value.last_item
            if isinstance(value, tracked_str):
                value = value.get_origin()
            if isinstance(value, str) and value.startswith("hf://"):
                filesystem = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
                resolved = filesystem.resolve_path(value)
                value = "hf://" + resolved.unresolve()
                revision_marker = "@" + resolved.revision
                if revision_marker in value:
                    # Move the revision out of the path and into a readable suffix.
                    path_without_revision = value.replace(revision_marker, "", 1)
                    value = path_without_revision + f" (at revision {resolved.revision})"
            sources.append(str(value))

        if sources:
            message_parts.append(
                f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(sources)}"
            )
        message_parts.append(
            "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        )
        return cls("".join(message_parts))
|
|
|
|
|
class ChecksumVerificationError(DatasetsError):
    """Error raised while verifying the checksums of downloaded files."""
|
|
|
|
|
class UnexpectedDownloadedFileError(ChecksumVerificationError):
    """Some of the downloaded files were not expected."""
|
|
|
|
|
class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
    """Some files that were supposed to be downloaded were not."""
|
|
|
|
|
class NonMatchingChecksumError(ChecksumVerificationError):
    """The checksum of a downloaded file does not match the expected checksum."""
|
|
|
|
|
class SplitsVerificationError(DatasetsError):
    """Error raised while verifying the splits."""
|
|
|
|
|
class UnexpectedSplitsError(SplitsVerificationError):
    """Splits verification error about unexpected splits.

    NOTE(review): this docstring used to read "The expected splits of the
    downloaded file is missing", which describes missing rather than
    unexpected splits. The wording now follows the class name; confirm it
    against the code that raises this error.
    """
|
|
|
|
|
class ExpectedMoreSplitsError(SplitsVerificationError):
    """Some of the recorded splits are missing."""
|
|
|
|
|
class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The sizes of the splits do not match the expected sizes."""
|
|