// Licensed to the Apache Software Foundation (ASF) under one | |
// or more contributor license agreements. See the NOTICE file | |
// distributed with this work for additional information | |
// regarding copyright ownership. The ASF licenses this file | |
// to you under the Apache License, Version 2.0 (the | |
// "License"); you may not use this file except in compliance | |
// with the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, | |
// software distributed under the License is distributed on an | |
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
// KIND, either express or implied. See the License for the | |
// specific language governing permissions and limitations | |
// under the License. | |
/// Logic for automatically determining the structure of multi-file | |
/// dataset with possible partitioning according to available | |
/// partitioning | |
// This API is EXPERIMENTAL. | |
namespace arrow { | |
namespace dataset { | |
/// \defgroup dataset-discovery Discovery API | |
/// | |
/// @{ | |
struct InspectOptions { | |
/// See `fragments` property. | |
static constexpr int kInspectAllFragments = -1; | |
/// Indicate how many fragments should be inspected to infer the unified dataset | |
/// schema. Limiting the number of fragments accessed improves the latency of | |
/// the discovery process when dealing with a high number of fragments and/or | |
/// high latency file systems. | |
/// | |
/// The default value of `1` inspects the schema of the first (in no particular | |
/// order) fragment only. If the dataset has a uniform schema for all fragments, | |
/// this default is the optimal value. In order to inspect all fragments and | |
/// robustly unify their potentially varying schemas, set this option to | |
/// `kInspectAllFragments`. A value of `0` disables inspection of fragments | |
/// altogether so only the partitioning schema will be inspected. | |
int fragments = 1; | |
/// Control how to unify types. By default, types are merged strictly (the | |
/// type must match exactly, except nulls can be merged with other types). | |
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults(); | |
}; | |
struct FinishOptions { | |
/// Finalize the dataset with this given schema. If the schema is not | |
/// provided, infer the schema via the Inspect, see the `inspect_options` | |
/// property. | |
std::shared_ptr<Schema> schema = NULLPTR; | |
/// If the schema is not provided, it will be discovered by passing the | |
/// following options to `DatasetDiscovery::Inspect`. | |
InspectOptions inspect_options{}; | |
/// Indicate if the given Schema (when specified), should be validated against | |
/// the fragments' schemas. `inspect_options` will control how many fragments | |
/// are checked. | |
bool validate_fragments = false; | |
}; | |
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected | |
/// schema before materializing said Dataset. | |
class ARROW_DS_EXPORT DatasetFactory { | |
public: | |
/// \brief Get the schemas of the Fragments and Partitioning. | |
virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas( | |
InspectOptions options) = 0; | |
/// \brief Get unified schema for the resulting Dataset. | |
Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {}); | |
/// \brief Create a Dataset | |
Result<std::shared_ptr<Dataset>> Finish(); | |
/// \brief Create a Dataset with the given schema (see \a InspectOptions::schema) | |
Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema); | |
/// \brief Create a Dataset with the given options | |
virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0; | |
/// \brief Optional root partition for the resulting Dataset. | |
const compute::Expression& root_partition() const { return root_partition_; } | |
/// \brief Set the root partition for the resulting Dataset. | |
Status SetRootPartition(compute::Expression partition) { | |
root_partition_ = std::move(partition); | |
return Status::OK(); | |
} | |
virtual ~DatasetFactory() = default; | |
protected: | |
DatasetFactory(); | |
compute::Expression root_partition_; | |
}; | |
/// @} | |
/// \brief DatasetFactory provides a way to inspect/discover a Dataset's | |
/// expected schema before materialization. | |
/// \ingroup dataset-implementations | |
class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory { | |
public: | |
static Result<std::shared_ptr<DatasetFactory>> Make( | |
std::vector<std::shared_ptr<DatasetFactory>> factories); | |
/// \brief Return the list of child DatasetFactory | |
const std::vector<std::shared_ptr<DatasetFactory>>& factories() const { | |
return factories_; | |
} | |
/// \brief Get the schemas of the Datasets. | |
/// | |
/// Instead of applying options globally, it applies at each child factory. | |
/// This will not respect `options.fragments` exactly, but will respect the | |
/// spirit of peeking the first fragments or all of them. | |
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas( | |
InspectOptions options) override; | |
/// \brief Create a Dataset. | |
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override; | |
protected: | |
explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories); | |
std::vector<std::shared_ptr<DatasetFactory>> factories_; | |
}; | |
/// \ingroup dataset-filesystem | |
struct FileSystemFactoryOptions { | |
/// Either an explicit Partitioning or a PartitioningFactory to discover one. | |
/// | |
/// If a factory is provided, it will be used to infer a schema for partition fields | |
/// based on file and directory paths then construct a Partitioning. The default | |
/// is a Partitioning which will yield no partition information. | |
/// | |
/// The (explicit or discovered) partitioning will be applied to discovered files | |
/// and the resulting partition information embedded in the Dataset. | |
PartitioningOrFactory partitioning{Partitioning::Default()}; | |
/// For the purposes of applying the partitioning, paths will be stripped | |
/// of the partition_base_dir. Files not matching the partition_base_dir | |
/// prefix will be skipped for partition discovery. The ignored files will still | |
/// be part of the Dataset, but will not have partition information. | |
/// | |
/// Example: | |
/// partition_base_dir = "/dataset"; | |
/// | |
/// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning | |
/// | |
/// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery. | |
/// | |
/// This is useful for partitioning which parses directory when ordering | |
/// is important, e.g. DirectoryPartitioning. | |
std::string partition_base_dir; | |
/// Invalid files (via selector or explicitly) will be excluded by checking | |
/// with the FileFormat::IsSupported method. This will incur IO for each files | |
/// in a serial and single threaded fashion. Disabling this feature will skip the | |
/// IO, but unsupported files may be present in the Dataset | |
/// (resulting in an error at scan time). | |
bool exclude_invalid_files = false; | |
/// When discovering from a Selector (and not from an explicit file list), ignore | |
/// files and directories matching any of these prefixes. | |
/// | |
/// Example (with selector = "/dataset/**"): | |
/// selector_ignore_prefixes = {"_", ".DS_STORE" }; | |
/// | |
/// - "/dataset/data.csv" -> not ignored | |
/// - "/dataset/_metadata" -> ignored | |
/// - "/dataset/.DS_STORE" -> ignored | |
/// - "/dataset/_hidden/dat" -> ignored | |
/// - "/dataset/nested/.DS_STORE" -> ignored | |
std::vector<std::string> selector_ignore_prefixes = { | |
".", | |
"_", | |
}; | |
}; | |
/// \brief FileSystemDatasetFactory creates a Dataset from a vector of | |
/// fs::FileInfo or a fs::FileSelector. | |
/// \ingroup dataset-filesystem | |
class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory { | |
public: | |
/// \brief Build a FileSystemDatasetFactory from an explicit list of | |
/// paths. | |
/// | |
/// \param[in] filesystem passed to FileSystemDataset | |
/// \param[in] paths passed to FileSystemDataset | |
/// \param[in] format passed to FileSystemDataset | |
/// \param[in] options see FileSystemFactoryOptions for more information. | |
static Result<std::shared_ptr<DatasetFactory>> Make( | |
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths, | |
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options); | |
/// \brief Build a FileSystemDatasetFactory from a fs::FileSelector. | |
/// | |
/// The selector will expand to a vector of FileInfo. The expansion/crawling | |
/// is performed in this function call. Thus, the finalized Dataset is | |
/// working with a snapshot of the filesystem. | |
// | |
/// If options.partition_base_dir is not provided, it will be overwritten | |
/// with selector.base_dir. | |
/// | |
/// \param[in] filesystem passed to FileSystemDataset | |
/// \param[in] selector used to crawl and search files | |
/// \param[in] format passed to FileSystemDataset | |
/// \param[in] options see FileSystemFactoryOptions for more information. | |
static Result<std::shared_ptr<DatasetFactory>> Make( | |
std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector, | |
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options); | |
/// \brief Build a FileSystemDatasetFactory from an uri including filesystem | |
/// information. | |
/// | |
/// \param[in] uri passed to FileSystemDataset | |
/// \param[in] format passed to FileSystemDataset | |
/// \param[in] options see FileSystemFactoryOptions for more information. | |
static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri, | |
std::shared_ptr<FileFormat> format, | |
FileSystemFactoryOptions options); | |
/// \brief Build a FileSystemDatasetFactory from an explicit list of | |
/// file information. | |
/// | |
/// \param[in] filesystem passed to FileSystemDataset | |
/// \param[in] files passed to FileSystemDataset | |
/// \param[in] format passed to FileSystemDataset | |
/// \param[in] options see FileSystemFactoryOptions for more information. | |
static Result<std::shared_ptr<DatasetFactory>> Make( | |
std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files, | |
std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options); | |
Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas( | |
InspectOptions options) override; | |
Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override; | |
protected: | |
FileSystemDatasetFactory(std::vector<fs::FileInfo> files, | |
std::shared_ptr<fs::FileSystem> filesystem, | |
std::shared_ptr<FileFormat> format, | |
FileSystemFactoryOptions options); | |
Result<std::shared_ptr<Schema>> PartitionSchema(); | |
std::vector<fs::FileInfo> files_; | |
std::shared_ptr<fs::FileSystem> fs_; | |
std::shared_ptr<FileFormat> format_; | |
FileSystemFactoryOptions options_; | |
}; | |
} // namespace dataset | |
} // namespace arrow | |