// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
namespace parquet {

// Forward declarations. The declarations in this header only refer to these
// types through pointers, references or smart pointers, so complete
// definitions are not required here.
class ColumnReader;
class FileMetaData;
class PageIndexReader;
class BloomFilterReader;
class PageReader;
class RowGroupMetaData;

namespace internal {
class RecordReader;
}  // namespace internal
class PARQUET_EXPORT RowGroupReader { | |
public: | |
// Forward declare a virtual class 'Contents' to aid dependency injection and more | |
// easily create test fixtures | |
// An implementation of the Contents class is defined in the .cc file | |
struct Contents { | |
virtual ~Contents() {} | |
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; | |
virtual const RowGroupMetaData* metadata() const = 0; | |
virtual const ReaderProperties* properties() const = 0; | |
}; | |
explicit RowGroupReader(std::unique_ptr<Contents> contents); | |
// Returns the rowgroup metadata | |
const RowGroupMetaData* metadata() const; | |
// Construct a ColumnReader for the indicated row group-relative | |
// column. Ownership is shared with the RowGroupReader. | |
std::shared_ptr<ColumnReader> Column(int i); | |
// EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. | |
// Ownership is shared with the RowGroupReader. | |
std::shared_ptr<internal::RecordReader> RecordReader(int i, | |
bool read_dictionary = false); | |
// Construct a ColumnReader, trying to enable exposed encoding. | |
// | |
// For dictionary encoding, currently we only support column chunks that are fully | |
// dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded. | |
// If a column chunk uses dictionary encoding but then falls back to plain encoding, the | |
// encoding will not be exposed. | |
// | |
// The returned column reader provides an API GetExposedEncoding() for the | |
// users to check the exposed encoding and determine how to read the batches. | |
// | |
// \note API EXPERIMENTAL | |
std::shared_ptr<ColumnReader> ColumnWithExposeEncoding( | |
int i, ExposedEncoding encoding_to_expose); | |
// Construct a RecordReader, trying to enable exposed encoding. | |
// | |
// For dictionary encoding, currently we only support column chunks that are | |
// fully dictionary encoded byte arrays. The caller should verify if the reader can read | |
// and expose the dictionary by checking the reader's read_dictionary(). If a column | |
// chunk uses dictionary encoding but then falls back to plain encoding, the returned | |
// reader will read decoded data without exposing the dictionary. | |
// | |
// \note API EXPERIMENTAL | |
std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding( | |
int i, ExposedEncoding encoding_to_expose); | |
std::unique_ptr<PageReader> GetColumnPageReader(int i); | |
private: | |
// Holds a pointer to an instance of Contents implementation | |
std::unique_ptr<Contents> contents_; | |
}; | |
class PARQUET_EXPORT ParquetFileReader { | |
public: | |
// Declare a virtual class 'Contents' to aid dependency injection and more | |
// easily create test fixtures | |
// An implementation of the Contents class is defined in the .cc file | |
struct PARQUET_EXPORT Contents { | |
static std::unique_ptr<Contents> Open( | |
std::shared_ptr<::arrow::io::RandomAccessFile> source, | |
const ReaderProperties& props = default_reader_properties(), | |
std::shared_ptr<FileMetaData> metadata = NULLPTR); | |
static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync( | |
std::shared_ptr<::arrow::io::RandomAccessFile> source, | |
const ReaderProperties& props = default_reader_properties(), | |
std::shared_ptr<FileMetaData> metadata = NULLPTR); | |
virtual ~Contents() = default; | |
// Perform any cleanup associated with the file contents | |
virtual void Close() = 0; | |
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; | |
virtual std::shared_ptr<FileMetaData> metadata() const = 0; | |
virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0; | |
virtual BloomFilterReader& GetBloomFilterReader() = 0; | |
}; | |
ParquetFileReader(); | |
~ParquetFileReader(); | |
// Create a file reader instance from an Arrow file object. Thread-safety is | |
// the responsibility of the file implementation | |
static std::unique_ptr<ParquetFileReader> Open( | |
std::shared_ptr<::arrow::io::RandomAccessFile> source, | |
const ReaderProperties& props = default_reader_properties(), | |
std::shared_ptr<FileMetaData> metadata = NULLPTR); | |
// API Convenience to open a serialized Parquet file on disk, using Arrow IO | |
// interfaces. | |
static std::unique_ptr<ParquetFileReader> OpenFile( | |
const std::string& path, bool memory_map = false, | |
const ReaderProperties& props = default_reader_properties(), | |
std::shared_ptr<FileMetaData> metadata = NULLPTR); | |
// Asynchronously open a file reader from an Arrow file object. | |
// Does not throw - all errors are reported through the Future. | |
static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync( | |
std::shared_ptr<::arrow::io::RandomAccessFile> source, | |
const ReaderProperties& props = default_reader_properties(), | |
std::shared_ptr<FileMetaData> metadata = NULLPTR); | |
void Open(std::unique_ptr<Contents> contents); | |
void Close(); | |
// The RowGroupReader is owned by the FileReader | |
std::shared_ptr<RowGroupReader> RowGroup(int i); | |
// Returns the file metadata. Only one instance is ever created | |
std::shared_ptr<FileMetaData> metadata() const; | |
/// Returns the PageIndexReader. Only one instance is ever created. | |
/// | |
/// If the file does not have the page index, nullptr may be returned. | |
/// Because it pays to check existence of page index in the file, it | |
/// is possible to return a non null value even if page index does | |
/// not exist. It is the caller's responsibility to check the return | |
/// value and follow-up calls to PageIndexReader. | |
/// | |
/// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader. | |
/// Initialize GetPageIndexReader() is not thread-safety. | |
std::shared_ptr<PageIndexReader> GetPageIndexReader(); | |
/// Returns the BloomFilterReader. Only one instance is ever created. | |
/// | |
/// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader. | |
/// Initialize GetBloomFilterReader() is not thread-safety. | |
BloomFilterReader& GetBloomFilterReader(); | |
/// Pre-buffer the specified column indices in all row groups. | |
/// | |
/// Readers can optionally call this to cache the necessary slices | |
/// of the file in-memory before deserialization. Arrow readers can | |
/// automatically do this via an option. This is intended to | |
/// increase performance when reading from high-latency filesystems | |
/// (e.g. Amazon S3). | |
/// | |
/// After calling this, creating readers for row groups/column | |
/// indices that were not buffered may fail. Creating multiple | |
/// readers for the a subset of the buffered regions is | |
/// acceptable. This may be called again to buffer a different set | |
/// of row groups/columns. | |
/// | |
/// If memory usage is a concern, note that data will remain | |
/// buffered in memory until either \a PreBuffer() is called again, | |
/// or the reader itself is destructed. Reading - and buffering - | |
/// only one row group at a time may be useful. | |
/// | |
/// This method may throw. | |
void PreBuffer(const std::vector<int>& row_groups, | |
const std::vector<int>& column_indices, | |
const ::arrow::io::IOContext& ctx, | |
const ::arrow::io::CacheOptions& options); | |
/// Retrieve the list of byte ranges that would need to be read to retrieve | |
/// the data for the specified row groups and column indices. | |
/// | |
/// A reader can optionally call this if they wish to handle their own | |
/// caching and management of file reads (or offload them to other readers). | |
/// Unlike PreBuffer, this method will not perform any actual caching or | |
/// reads, instead just using the file metadata to determine the byte ranges | |
/// that would need to be read if you were to consume the entirety of the column | |
/// chunks for the provided columns in the specified row groups. | |
/// | |
/// If row_groups or column_indices are empty, then the result of this will be empty. | |
/// | |
/// hole_size_limit represents the maximum distance, in bytes, between two | |
/// consecutive ranges; beyond this value, ranges will not be combined. The default | |
/// value is 1MB. | |
/// | |
/// range_size_limit is the maximum size in bytes of a combined range; if combining | |
/// two consecutive ranges would produce a range larger than this, they are not | |
/// combined. The default values is 64MB. This *must* be larger than hole_size_limit. | |
/// | |
/// This will not take into account page indexes or any other predicate push down | |
/// benefits that may be available. | |
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( | |
const std::vector<int>& row_groups, const std::vector<int>& column_indices, | |
int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); | |
/// Wait for the specified row groups and column indices to be pre-buffered. | |
/// | |
/// After the returned Future completes, reading the specified row | |
/// groups/columns will not block. | |
/// | |
/// PreBuffer must be called first. This method does not throw. | |
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups, | |
const std::vector<int>& column_indices) const; | |
private: | |
// Holds a pointer to an instance of Contents implementation | |
std::unique_ptr<Contents> contents_; | |
}; | |
// Read only Parquet file metadata | |
std::shared_ptr<FileMetaData> PARQUET_EXPORT | |
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); | |
/// \brief Scan all values in file. Useful for performance testing
/// \param[in] columns the column numbers to scan. If empty scans all
/// \param[in] column_batch_size number of values to read at a time when scanning column
/// \param[in] reader a ParquetFileReader instance
/// \return number of semantic rows in file
PARQUET_EXPORT
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
                         ParquetFileReader* reader);
} // namespace parquet | |