File size: 4,185 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The HuggingFace Authors.
from typing import Any, Optional, Union

from huggingface_hub import HfFileSystem

from . import config
from .table import CastError
from .utils.track import TrackedIterableFromGenerator, tracked_list, tracked_str


class DatasetsError(Exception):
    """Base class for exceptions in this library.

    The other exception classes defined in this module derive from it, so
    ``except DatasetsError`` catches any of them.
    """


class DefunctDatasetError(DatasetsError):
    """The dataset is defunct."""


class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """FileNotFoundError raised by this library.

    Inherits from both `DatasetsError` and the built-in `FileNotFoundError`,
    so it can be caught as either.
    """


class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """No (supported) data files were found."""


class DatasetNotFoundError(FileNotFoundDatasetsError):
    """Dataset not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset while the user is not authenticated.
    """


class DatasetBuildError(DatasetsError):
    """Base class for dataset build errors.

    Parent of `ManualDownloadError`, `FileFormatError` and `DatasetGenerationError`.
    """


class ManualDownloadError(DatasetBuildError):
    """Dataset build error related to manual downloads.

    NOTE(review): the raise sites are outside this module; presumably raised
    when data that must be downloaded manually is not available -- confirm.
    """


class FileFormatError(DatasetBuildError):
    """Dataset build error related to the format of a file.

    NOTE(review): the raise sites are outside this module; presumably raised
    when a data file does not have the expected format -- confirm.
    """


class DatasetGenerationError(DatasetBuildError):
    """Dataset build error for failures while generating the dataset.

    Parent of `DatasetGenerationCastError`.
    """


class DatasetGenerationCastError(DatasetGenerationError):
    """Dataset generation error built from a `CastError` (see `from_cast_error`)."""

    @classmethod
    def from_cast_error(
        cls,
        cast_error: CastError,
        builder_name: str,
        gen_kwargs: dict[str, Any],
        token: Optional[Union[bool, str]],
    ) -> "DatasetGenerationCastError":
        """Create the error with a user-facing message derived from a cast error.

        Args:
            cast_error: The cast error; the output of its `details()` method is
                embedded in the message.
            builder_name: Name of the dataset builder, quoted in the message.
            gen_kwargs: Generation kwargs. Only values that are `tracked_str`,
                `tracked_list` or `TrackedIterableFromGenerator` instances are
                reported; all other values are ignored.
            token: Forwarded to `HfFileSystem` when resolving `hf://` paths.

        Returns:
            A new instance of `cls` carrying the assembled message.
        """
        tracked_types = (tracked_str, tracked_list, TrackedIterableFromGenerator)
        tracked_containers = (tracked_list, TrackedIterableFromGenerator)

        message_parts = [
            "An error occurred while generating the dataset",
            f"\n\nAll the data files must have the same columns, but at some point {cast_error.details()}",
        ]

        origins: list[str] = []
        for value in gen_kwargs.values():
            if isinstance(value, tracked_types):
                origin = value
                # Follow `last_item` through (possibly nested) tracked containers
                # until reaching a non-container or a container whose `last_item` is None.
                while isinstance(origin, tracked_containers):
                    if origin.last_item is None:
                        break
                    origin = origin.last_item
                if isinstance(origin, tracked_str):
                    origin = origin.get_origin()
                if isinstance(origin, str) and origin.startswith("hf://"):
                    filesystem = HfFileSystem(endpoint=config.HF_ENDPOINT, token=token)
                    resolved = filesystem.resolve_path(origin)
                    origin = "hf://" + resolved.unresolve()
                    revision_marker = "@" + resolved.revision
                    if revision_marker in origin:
                        # Move the revision out of the path and into a readable suffix.
                        without_revision = origin.replace(revision_marker, "", 1)
                        origin = without_revision + f" (at revision {resolved.revision})"
                origins.append(str(origin))

        if origins:
            listed_origins = ", ".join(origins)
            message_parts.append(
                f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{listed_origins}"
            )
        message_parts.append(
            "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
        )
        return cls("".join(message_parts))


class ChecksumVerificationError(DatasetsError):
    """Error raised during checksum verification of downloaded files."""


class UnexpectedDownloadedFileError(ChecksumVerificationError):
    """Some of the downloaded files were not expected."""


class ExpectedMoreDownloadedFilesError(ChecksumVerificationError):
    """Some files that were supposed to be downloaded were not."""


class NonMatchingChecksumError(ChecksumVerificationError):
    """The downloaded file's checksum doesn't match the expected checksum."""


class SplitsVerificationError(DatasetsError):
    """Error raised during verification of the splits."""


class UnexpectedSplitsError(SplitsVerificationError):
    """The expected splits of the downloaded file are missing."""

    # NOTE(review): the class name suggests the opposite case (splits were found
    # that were not expected); raise sites are outside this module -- confirm and
    # adjust the docstring if needed.


class ExpectedMoreSplitsError(SplitsVerificationError):
    """Some recorded splits are missing."""

    # NOTE(review): raise sites are outside this module; confirm whether the
    # missing splits are the "recorded" or the "expected" ones.


class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The split sizes don't match the expected split sizes."""