# SPDX-License-Identifier: Apache-2.0 # Copyright 2023 The HuggingFace Authors. from typing import Any, Optional, Union from huggingface_hub import HfFileSystem from . import config from .table import CastError from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str class DatasetsError(Exception): """Base class for exceptions in this library.""" class DefunctDatasetError(DatasetsError): """The dataset has been defunct.""" class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError): """FileNotFoundError raised by this library.""" class DataFilesNotFoundError(FileNotFoundDatasetsError): """No (supported) data files found.""" class DatasetNotFoundError(FileNotFoundDatasetsError): """Dataset not found. Raised when trying to access: - a missing dataset, or - a private/gated dataset and the user is not authenticated. """ class DatasetBuildError(DatasetsError): pass class ManualDownloadError(DatasetBuildError): pass class FileFormatError(DatasetBuildError): pass class DatasetGenerationError(DatasetBuildError): pass class DatasetGenerationCastError(DatasetGenerationError): @classmethod def from_cast_error( cls, cast_error: CastError, builder_name: str, gen_kwargs: dict[str, Any], token: Optional[Union[bool, str]], ) -> "DatasetGenerationCastError": explanation_message = ( f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}" ) formatted_tracked_gen_kwargs: list[str] = [] for gen_kwarg in gen_kwargs.values(): if not isinstance(gen_kwarg, (tracked_str, tracked_list, TrackedIterableFromGenerator)): continue while ( isinstance(gen_kwarg, (tracked_list, TrackedIterableFromGenerator)) and gen_kwarg.last_item is not None ): gen_kwarg = gen_kwarg.last_item if isinstance(gen_kwarg, tracked_str): gen_kwarg = gen_kwarg.get_origin() if isinstance(gen_kwarg, str) and gen_kwarg.startswith("hf://"): resolved_path = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token).resolve_path(gen_kwarg) gen_kwarg = "hf://" + resolved_path.unresolve() if "@" + resolved_path.revision in gen_kwarg: gen_kwarg = ( gen_kwarg.replace("@" + resolved_path.revision, "", 1) + f" (at revision {resolved_path.revision})" ) formatted_tracked_gen_kwargs.append(str(gen_kwarg)) if formatted_tracked_gen_kwargs: explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(formatted_tracked_gen_kwargs)}" help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)" return cls("An error occurred while generating the dataset" + explanation_message + help_message) class ChecksumVerificationError(DatasetsError): """Error raised during checksums verifications of downloaded files.""" class UnexpectedDownloadedFileError(ChecksumVerificationError): """Some downloaded files were not expected.""" class ExpectedMoreDownloadedFilesError(ChecksumVerificationError): """Some files were supposed to be downloaded but were not.""" class NonMatchingChecksumError(ChecksumVerificationError): """The downloaded file checksum don't match the expected checksum.""" class SplitsVerificationError(DatasetsError): """Error raised during splits verifications.""" class UnexpectedSplitsError(SplitsVerificationError): """The expected splits of the downloaded file is missing.""" class ExpectedMoreSplitsError(SplitsVerificationError): """Some recorded splits are missing.""" class NonMatchingSplitsSizesError(SplitsVerificationError): """The splits sizes don't match the expected splits sizes."""