|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""List and inspect datasets.""" |
|
|
|
import os |
|
from collections.abc import Mapping, Sequence |
|
from typing import Optional, Union |
|
|
|
from .download.download_config import DownloadConfig |
|
from .download.download_manager import DownloadMode |
|
from .download.streaming_download_manager import StreamingDownloadManager |
|
from .info import DatasetInfo |
|
from .load import ( |
|
dataset_module_factory, |
|
get_dataset_builder_class, |
|
load_dataset_builder, |
|
) |
|
from .utils.logging import get_logger |
|
from .utils.version import Version |
|
|
|
|
|
# Module-level logger, named after this module.
logger = get_logger(__name__)
|
|
|
|
|
class SplitsNotFoundError(ValueError):
    """Raised when the split names of a dataset configuration cannot be determined."""
|
|
|
|
|
def get_dataset_infos(
    path: str,
    data_files: Optional[Union[dict, list, str]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """Get the meta information about every configuration of a dataset.

    The config names are listed with `get_dataset_config_names`, then the info of each
    config is fetched with `get_dataset_config_info`.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.
            They are only forwarded to the per-config info lookup, not to the config name listing.

    Returns:
        `dict`: a mapping from config name to the object returned by `get_dataset_config_info`
        for that config (a [`DatasetInfo`]).

    Example:

    ```py
    >>> from datasets import get_dataset_infos
    >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes')
    {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...}
    ```
    """
    # Keyword arguments that both helper calls receive unchanged.
    shared_kwargs = {
        "data_files": data_files,
        "download_config": download_config,
        "download_mode": download_mode,
        "revision": revision,
        "token": token,
    }
    infos = {}
    for name in get_dataset_config_names(path=path, **shared_kwargs):
        infos[name] = get_dataset_config_info(
            path=path,
            config_name=name,
            **shared_kwargs,
            **config_kwargs,
        )
    return infos
|
|
|
|
|
def get_dataset_config_names(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    dynamic_modules_path: Optional[str] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
):
    """List the config names available for a given dataset.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`.
            By default the datasets are stored inside the `datasets_modules` module.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        `list`: the keys of the builder class' `builder_configs`. When there are none, a
        one-element list holding the `config_name` found in the module's builder kwargs,
        or else the builder's `DEFAULT_CONFIG_NAME`, or else `"default"`.

    Example:

    ```py
    >>> from datasets import get_dataset_config_names
    >>> get_dataset_config_names("nyu-mll/glue")
    ['cola',
     'sst2',
     'mrpc',
     'qqp',
     'stsb',
     'mnli',
     'mnli_mismatched',
     'mnli_matched',
     'qnli',
     'rte',
     'wnli',
     'ax']
    ```
    """
    module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        dynamic_modules_path=dynamic_modules_path,
        data_files=data_files,
        **download_kwargs,
    )
    builder_class = get_dataset_builder_class(module, dataset_name=os.path.basename(path))

    declared_names = list(builder_class.builder_configs.keys())
    if declared_names:
        return declared_names

    # No declared configs: fall back to a single name, computed only on this path.
    fallback_name = module.builder_kwargs.get(
        "config_name", builder_class.DEFAULT_CONFIG_NAME or "default"
    )
    return [fallback_name]
|
|
|
|
|
def get_dataset_default_config_name(
    path: str,
    revision: Optional[Union[str, Version]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    dynamic_modules_path: Optional[str] = None,
    data_files: Optional[Union[dict, list, str]] = None,
    **download_kwargs,
) -> Optional[str]:
    """Return the default config name of a dataset.

    The builder's `DEFAULT_CONFIG_NAME` wins when it is truthy. Otherwise the result is
    `"default"` if the builder declares no config, the only config name if it declares
    exactly one, and `None` if it declares several.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        revision (`Union[str, datasets.Version]`, *optional*):
            If specified, the dataset module will be loaded from the datasets repository at this version.
            By default:
            - it is set to the local version of the lib.
            - it will also try to load it from the main branch if it's not available at the local version of the lib.
            Specifying a version that is different from your local version of the lib might cause compatibility issues.
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`):
            Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`.
            By default the datasets are stored inside the `datasets_modules` module.
        data_files (`Union[Dict, List, str]`, *optional*):
            Defining the data_files of the dataset configuration.
        **download_kwargs (additional keyword arguments):
            Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied,
            for example `token`.

    Returns:
        Optional[str]: the default config name if there is one

    Example:

    ```py
    >>> from datasets import get_dataset_default_config_name
    >>> get_dataset_default_config_name("openbookqa")
    'main'
    ```
    """
    module = dataset_module_factory(
        path,
        revision=revision,
        download_config=download_config,
        download_mode=download_mode,
        dynamic_modules_path=dynamic_modules_path,
        data_files=data_files,
        **download_kwargs,
    )
    builder_class = get_dataset_builder_class(module, dataset_name=os.path.basename(path))

    declared_names = list(builder_class.builder_configs.keys())
    if not declared_names:
        implicit_default = "default"
    elif len(declared_names) == 1:
        implicit_default = declared_names[0]
    else:
        # Several configs and nothing to single one out.
        implicit_default = None
    return builder_class.DEFAULT_CONFIG_NAME or implicit_default
|
|
|
|
|
def get_dataset_config_info(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
) -> DatasetInfo:
    """Get the meta information ([`DatasetInfo`]) about a dataset for a particular config.

    The info comes from the builder returned by `load_dataset_builder`. When that info has
    no splits, they are filled in from the builder's split generators, which are run with a
    [`StreamingDownloadManager`].

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Returns:
        [`DatasetInfo`]: the builder's info, with `splits` populated.

    Raises:
        SplitsNotFoundError: if an exception occurs while collecting the split names from
            the builder's split generators. The original exception is chained as the cause.
    """
    builder = load_dataset_builder(
        path,
        name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    info = builder.info
    if info.splits is not None:
        # Splits are already known; nothing to compute.
        return info

    # Work on a copy (or a fresh config) so that setting the token below
    # does not modify the object passed in by the caller.
    if download_config:
        download_config = download_config.copy()
    else:
        download_config = DownloadConfig()
    if token is not None:
        download_config.token = token

    builder._check_manual_download(
        StreamingDownloadManager(base_path=builder.base_path, download_config=download_config)
    )
    try:
        streaming_manager = StreamingDownloadManager(
            base_path=builder.base_path, download_config=download_config
        )
        collected_splits = {}
        for generator in builder._split_generators(streaming_manager):
            split_name = generator.name
            collected_splits[split_name] = {"name": split_name, "dataset_name": path}
        info.splits = collected_splits
    except Exception as err:
        raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
    return info
|
|
|
|
|
def get_dataset_split_names(
    path: str,
    config_name: Optional[str] = None,
    data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    revision: Optional[Union[str, Version]] = None,
    token: Optional[Union[bool, str]] = None,
    **config_kwargs,
):
    """List the splits available for a given dataset and config.

    All arguments are forwarded to `get_dataset_config_info`; the split names are the
    keys of the `splits` attribute of the info it returns.

    Args:
        path (`str`): path to the dataset processing script with the dataset builder. Can be either:

            - a local path to processing script or the directory containing the script (if the script has the same name as the directory),
                e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`
            - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]),
                e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'`
        config_name (`str`, *optional*):
            Defining the name of the dataset configuration.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        download_config ([`DownloadConfig`], *optional*):
            Specific download configuration parameters.
        download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`):
            Download/generate mode.
        revision ([`Version`] or `str`, *optional*):
            Version of the dataset script to load.
            As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
            You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository.
        token (`str` or `bool`, *optional*):
            Optional string or boolean to use as Bearer token for remote files on the Datasets Hub.
            If `True`, or not specified, will get token from `"~/.huggingface"`.
        **config_kwargs (additional keyword arguments):
            Optional attributes for builder class which will override the attributes if supplied.

    Returns:
        `list`: the split names of the requested config.

    Example:

    ```py
    >>> from datasets import get_dataset_split_names
    >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes')
    ['train', 'validation', 'test']
    ```
    """
    config_info = get_dataset_config_info(
        path,
        config_name=config_name,
        data_files=data_files,
        download_config=download_config,
        download_mode=download_mode,
        revision=revision,
        token=token,
        **config_kwargs,
    )
    split_names = list(config_info.splits.keys())
    return split_names
|
|