from typing import Optional from .. import Features, NamedSplit from ..packaged_modules.text.text import Text from ..utils.typing import NestedDataStructureLike, PathLike from .abc import AbstractDatasetReader class TextDatasetReader(AbstractDatasetReader): def __init__( self, path_or_paths: NestedDataStructureLike[PathLike], split: Optional[NamedSplit] = None, features: Optional[Features] = None, cache_dir: str = None, keep_in_memory: bool = False, streaming: bool = False, num_proc: Optional[int] = None, **kwargs, ): super().__init__( path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, streaming=streaming, num_proc=num_proc, **kwargs, ) path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths} self.builder = Text( cache_dir=cache_dir, data_files=path_or_paths, features=features, **kwargs, ) def read(self): # Build iterable dataset if self.streaming: dataset = self.builder.as_streaming_dataset(split=self.split) # Build regular (map-style) dataset else: download_config = None download_mode = None verification_mode = None base_path = None self.builder.download_and_prepare( download_config=download_config, download_mode=download_mode, verification_mode=verification_mode, base_path=base_path, num_proc=self.num_proc, ) dataset = self.builder.as_dataset( split=self.split, verification_mode=verification_mode, in_memory=self.keep_in_memory ) return dataset