# Copyright 2020 The HuggingFace Datasets Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Lint as: python3 """List and inspect datasets.""" import os from collections.abc import Mapping, Sequence from typing import Optional, Union from .download.download_config import DownloadConfig from .download.download_manager import DownloadMode from .download.streaming_download_manager import StreamingDownloadManager from .info import DatasetInfo from .load import ( dataset_module_factory, get_dataset_builder_class, load_dataset_builder, ) from .utils.logging import get_logger from .utils.version import Version logger = get_logger(__name__) class SplitsNotFoundError(ValueError): pass def get_dataset_infos( path: str, data_files: Optional[Union[dict, list, str]] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, revision: Optional[Union[str, Version]] = None, token: Optional[Union[bool, str]] = None, **config_kwargs, ): """Get the meta information about a dataset, returned as a dict mapping config name to DatasetInfoDict. Args: path (`str`): path to the dataset processing script with the dataset builder. Can be either: - a local path to processing script or the directory containing the script (if the script has the same name as the directory), e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'` - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or``'openai/webtext'` revision (`Union[str, datasets.Version]`, *optional*): If specified, the dataset module will be loaded from the datasets repository at this version. By default: - it is set to the local version of the lib. - it will also try to load it from the main branch if it's not available at the local version of the lib. Specifying a version that is different from your local version of the lib might cause compatibility issues. download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters. download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): Download/generate mode. data_files (`Union[Dict, List, str]`, *optional*): Defining the data_files of the dataset configuration. token (`str` or `bool`, *optional*): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. If `True`, or not specified, will get token from `"~/.huggingface"`. **config_kwargs (additional keyword arguments): Optional attributes for builder class which will override the attributes if supplied. Example: ```py >>> from datasets import get_dataset_infos >>> get_dataset_infos('cornell-movie-review-data/rotten_tomatoes') {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews...), ...} ``` """ config_names = get_dataset_config_names( path=path, revision=revision, download_config=download_config, download_mode=download_mode, data_files=data_files, token=token, ) return { config_name: get_dataset_config_info( path=path, config_name=config_name, data_files=data_files, download_config=download_config, download_mode=download_mode, revision=revision, token=token, **config_kwargs, ) for config_name in config_names } def get_dataset_config_names( path: str, revision: Optional[Union[str, Version]] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, dynamic_modules_path: Optional[str] = None, data_files: Optional[Union[dict, list, str]] = None, **download_kwargs, ): """Get the list of available config names for a particular dataset. Args: path (`str`): path to the dataset processing script with the dataset builder. Can be either: - a local path to processing script or the directory containing the script (if the script has the same name as the directory), e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'` - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'` revision (`Union[str, datasets.Version]`, *optional*): If specified, the dataset module will be loaded from the datasets repository at this version. By default: - it is set to the local version of the lib. - it will also try to load it from the main branch if it's not available at the local version of the lib. Specifying a version that is different from your local version of the lib might cause compatibility issues. download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters. download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): Download/generate mode. dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`): Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`. By default the datasets are stored inside the `datasets_modules` module. data_files (`Union[Dict, List, str]`, *optional*): Defining the data_files of the dataset configuration. **download_kwargs (additional keyword arguments): Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied, for example `token`. Example: ```py >>> from datasets import get_dataset_config_names >>> get_dataset_config_names("nyu-mll/glue") ['cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax'] ``` """ dataset_module = dataset_module_factory( path, revision=revision, download_config=download_config, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path, data_files=data_files, **download_kwargs, ) builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path)) return list(builder_cls.builder_configs.keys()) or [ dataset_module.builder_kwargs.get("config_name", builder_cls.DEFAULT_CONFIG_NAME or "default") ] def get_dataset_default_config_name( path: str, revision: Optional[Union[str, Version]] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, dynamic_modules_path: Optional[str] = None, data_files: Optional[Union[dict, list, str]] = None, **download_kwargs, ) -> Optional[str]: """Get the default config name for a particular dataset. Can return None only if the dataset has multiple configurations and no default configuration. Args: path (`str`): path to the dataset processing script with the dataset builder. Can be either: - a local path to processing script or the directory containing the script (if the script has the same name as the directory), e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'` - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'` revision (`Union[str, datasets.Version]`, *optional*): If specified, the dataset module will be loaded from the datasets repository at this version. By default: - it is set to the local version of the lib. - it will also try to load it from the main branch if it's not available at the local version of the lib. Specifying a version that is different from your local version of the lib might cause compatibility issues. download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters. download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): Download/generate mode. dynamic_modules_path (`str`, defaults to `~/.cache/huggingface/modules/datasets_modules`): Optional path to the directory in which the dynamic modules are saved. It must have been initialized with `init_dynamic_modules`. By default the datasets are stored inside the `datasets_modules` module. data_files (`Union[Dict, List, str]`, *optional*): Defining the data_files of the dataset configuration. **download_kwargs (additional keyword arguments): Optional attributes for [`DownloadConfig`] which will override the attributes in `download_config` if supplied, for example `token`. Returns: Optional[str]: the default config name if there is one Example: ```py >>> from datasets import get_dataset_default_config_name >>> get_dataset_default_config_name("openbookqa") 'main' ``` """ dataset_module = dataset_module_factory( path, revision=revision, download_config=download_config, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path, data_files=data_files, **download_kwargs, ) builder_cls = get_dataset_builder_class(dataset_module, dataset_name=os.path.basename(path)) builder_configs = list(builder_cls.builder_configs.keys()) if builder_configs: default_config_name = builder_configs[0] if len(builder_configs) == 1 else None else: default_config_name = "default" return builder_cls.DEFAULT_CONFIG_NAME or default_config_name def get_dataset_config_info( path: str, config_name: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, revision: Optional[Union[str, Version]] = None, token: Optional[Union[bool, str]] = None, **config_kwargs, ) -> DatasetInfo: """Get the meta information (DatasetInfo) about a dataset for a particular config Args: path (``str``): path to the dataset processing script with the dataset builder. Can be either: - a local path to processing script or the directory containing the script (if the script has the same name as the directory), e.g. ``'./dataset/squad'`` or ``'./dataset/squad/squad.py'`` - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), e.g. ``'rajpurkar/squad'``, ``'nyu-mll/glue'`` or ``'openai/webtext'`` config_name (:obj:`str`, optional): Defining the name of the dataset configuration. data_files (:obj:`str` or :obj:`Sequence` or :obj:`Mapping`, optional): Path(s) to source data file(s). download_config (:class:`~download.DownloadConfig`, optional): Specific download configuration parameters. download_mode (:class:`DownloadMode` or :obj:`str`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode. revision (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load. As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. token (``str`` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. If True, or not specified, will get token from `"~/.huggingface"`. **config_kwargs (additional keyword arguments): optional attributes for builder class which will override the attributes if supplied. """ builder = load_dataset_builder( path, name=config_name, data_files=data_files, download_config=download_config, download_mode=download_mode, revision=revision, token=token, **config_kwargs, ) info = builder.info if info.splits is None: download_config = download_config.copy() if download_config else DownloadConfig() if token is not None: download_config.token = token builder._check_manual_download( StreamingDownloadManager(base_path=builder.base_path, download_config=download_config) ) try: info.splits = { split_generator.name: {"name": split_generator.name, "dataset_name": path} for split_generator in builder._split_generators( StreamingDownloadManager(base_path=builder.base_path, download_config=download_config) ) } except Exception as err: raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err return info def get_dataset_split_names( path: str, config_name: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, download_config: Optional[DownloadConfig] = None, download_mode: Optional[Union[DownloadMode, str]] = None, revision: Optional[Union[str, Version]] = None, token: Optional[Union[bool, str]] = None, **config_kwargs, ): """Get the list of available splits for a particular config and dataset. Args: path (`str`): path to the dataset processing script with the dataset builder. Can be either: - a local path to processing script or the directory containing the script (if the script has the same name as the directory), e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'` - a dataset identifier on the Hugging Face Hub (list all available datasets and ids with [`huggingface_hub.list_datasets`]), e.g. `'rajpurkar/squad'`, `'nyu-mll/glue'` or `'openai/webtext'` config_name (`str`, *optional*): Defining the name of the dataset configuration. data_files (`str` or `Sequence` or `Mapping`, *optional*): Path(s) to source data file(s). download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters. download_mode ([`DownloadMode`] or `str`, defaults to `REUSE_DATASET_IF_EXISTS`): Download/generate mode. revision ([`Version`] or `str`, *optional*): Version of the dataset script to load. As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch. You can specify a different version than the default "main" by using a commit SHA or a git tag of the dataset repository. token (`str` or `bool`, *optional*): Optional string or boolean to use as Bearer token for remote files on the Datasets Hub. If `True`, or not specified, will get token from `"~/.huggingface"`. **config_kwargs (additional keyword arguments): Optional attributes for builder class which will override the attributes if supplied. Example: ```py >>> from datasets import get_dataset_split_names >>> get_dataset_split_names('cornell-movie-review-data/rotten_tomatoes') ['train', 'validation', 'test'] ``` """ info = get_dataset_config_info( path, config_name=config_name, data_files=data_files, download_config=download_config, download_mode=download_mode, revision=revision, token=token, **config_kwargs, ) return list(info.splits.keys())