File size: 11,185 Bytes
9c6594c |
|
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "arrow/io/caching.h"
#include "arrow/util/type_fwd.h"
#include "parquet/metadata.h" // IWYU pragma: keep
#include "parquet/platform.h"
#include "parquet/properties.h"
namespace parquet {
// Forward declarations of types that the declarations below only use through
// pointers, references or smart pointers.
class ColumnReader;
class FileMetaData;
class PageIndexReader;
class BloomFilterReader;
class PageReader;
class RowGroupMetaData;
namespace internal {
class RecordReader;
}
/// \brief Reader for a single row group.
///
/// Exposes the row group metadata and constructs column-level readers
/// (ColumnReader, RecordReader, PageReader) for the columns of the row group.
/// The reader is built around a Contents implementation that it owns.
class PARQUET_EXPORT RowGroupReader {
 public:
  // Declare an abstract interface 'Contents' to aid dependency injection and
  // more easily create test fixtures.
  // An implementation of the Contents class is defined in the .cc file
  struct Contents {
    // Defaulted (rather than an empty body) to match ParquetFileReader::Contents.
    virtual ~Contents() = default;

    // Construct a PageReader for column `i`; the caller owns the result.
    virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;

    // Returns the rowgroup metadata (non-owning pointer)
    virtual const RowGroupMetaData* metadata() const = 0;

    // Returns the reader properties (non-owning pointer)
    virtual const ReaderProperties* properties() const = 0;
  };

  /// \brief Construct a reader, taking ownership of the given Contents.
  explicit RowGroupReader(std::unique_ptr<Contents> contents);

  // Returns the rowgroup metadata
  const RowGroupMetaData* metadata() const;

  // Construct a ColumnReader for the indicated row group-relative
  // column. Ownership is shared with the RowGroupReader.
  std::shared_ptr<ColumnReader> Column(int i);

  // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group.
  // Ownership is shared with the RowGroupReader.
  std::shared_ptr<internal::RecordReader> RecordReader(int i,
                                                       bool read_dictionary = false);

  // Construct a ColumnReader, trying to enable exposed encoding.
  //
  // For dictionary encoding, currently we only support column chunks that are fully
  // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
  // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
  // encoding will not be exposed.
  //
  // The returned column reader provides an API GetExposedEncoding() for the
  // users to check the exposed encoding and determine how to read the batches.
  //
  // \note API EXPERIMENTAL
  std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
      int i, ExposedEncoding encoding_to_expose);

  // Construct a RecordReader, trying to enable exposed encoding.
  //
  // For dictionary encoding, currently we only support column chunks that are
  // fully dictionary encoded byte arrays. The caller should verify if the reader can read
  // and expose the dictionary by checking the reader's read_dictionary(). If a column
  // chunk uses dictionary encoding but then falls back to plain encoding, the returned
  // reader will read decoded data without exposing the dictionary.
  //
  // \note API EXPERIMENTAL
  std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding(
      int i, ExposedEncoding encoding_to_expose);

  // Construct a PageReader for column `i`; the caller owns the result.
  std::unique_ptr<PageReader> GetColumnPageReader(int i);

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
};
/// \brief Entry point for reading a Parquet file.
///
/// Provides access to the file metadata, per-row-group readers, the page index
/// and bloom filter readers, and helpers to pre-buffer or plan byte-range reads.
/// The reader holds a Contents implementation (see contents_).
class PARQUET_EXPORT ParquetFileReader {
public:
// Declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
struct PARQUET_EXPORT Contents {
// Create a Contents instance for the given Arrow file object. `metadata` may
// be left null (the default).
static std::unique_ptr<Contents> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// Asynchronous counterpart of Open(); the result is delivered through the
// returned Future.
static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
virtual ~Contents() = default;
// Perform any cleanup associated with the file contents
virtual void Close() = 0;
// Returns the reader for row group `i`
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
// Returns the file metadata
virtual std::shared_ptr<FileMetaData> metadata() const = 0;
// Returns the PageIndexReader
virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0;
// Returns the BloomFilterReader (by reference, so the caller does not own it)
virtual BloomFilterReader& GetBloomFilterReader() = 0;
};
ParquetFileReader();
~ParquetFileReader();
// Create a file reader instance from an Arrow file object. Thread-safety is
// the responsibility of the file implementation
static std::unique_ptr<ParquetFileReader> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// API Convenience to open a serialized Parquet file on disk, using Arrow IO
// interfaces.
static std::unique_ptr<ParquetFileReader> OpenFile(
const std::string& path, bool memory_map = false,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// Asynchronously open a file reader from an Arrow file object.
// Does not throw - all errors are reported through the Future.
static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
const ReaderProperties& props = default_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);
// Open the reader on an existing Contents implementation. Ownership of
// `contents` is transferred to this call (unique_ptr passed by value).
// NOTE(review): presumably stored in contents_ - confirm in the .cc file.
void Open(std::unique_ptr<Contents> contents);
void Close();
// The RowGroupReader is owned by the FileReader
std::shared_ptr<RowGroupReader> RowGroup(int i);
// Returns the file metadata. Only one instance is ever created
std::shared_ptr<FileMetaData> metadata() const;
/// Returns the PageIndexReader. Only one instance is ever created.
///
/// If the file does not have the page index, nullptr may be returned.
/// Because checking for the existence of the page index in the file has a
/// cost, it is possible to return a non-null value even if the page index
/// does not exist. It is the caller's responsibility to check the return
/// value and the results of follow-up calls to PageIndexReader.
///
/// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader.
/// Initialization of GetPageIndexReader() is not thread-safe.
std::shared_ptr<PageIndexReader> GetPageIndexReader();
/// Returns the BloomFilterReader. Only one instance is ever created.
///
/// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader.
/// Initialization of GetBloomFilterReader() is not thread-safe.
BloomFilterReader& GetBloomFilterReader();
/// Pre-buffer the specified column indices in all row groups.
///
/// Readers can optionally call this to cache the necessary slices
/// of the file in-memory before deserialization. Arrow readers can
/// automatically do this via an option. This is intended to
/// increase performance when reading from high-latency filesystems
/// (e.g. Amazon S3).
///
/// After calling this, creating readers for row groups/column
/// indices that were not buffered may fail. Creating multiple
/// readers for a subset of the buffered regions is
/// acceptable. This may be called again to buffer a different set
/// of row groups/columns.
///
/// If memory usage is a concern, note that data will remain
/// buffered in memory until either \a PreBuffer() is called again,
/// or the reader itself is destructed. Reading - and buffering -
/// only one row group at a time may be useful.
///
/// This method may throw.
void PreBuffer(const std::vector<int>& row_groups,
const std::vector<int>& column_indices,
const ::arrow::io::IOContext& ctx,
const ::arrow::io::CacheOptions& options);
/// Retrieve the list of byte ranges that would need to be read to retrieve
/// the data for the specified row groups and column indices.
///
/// A reader can optionally call this if they wish to handle their own
/// caching and management of file reads (or offload them to other readers).
/// Unlike PreBuffer, this method will not perform any actual caching or
/// reads, instead just using the file metadata to determine the byte ranges
/// that would need to be read if you were to consume the entirety of the column
/// chunks for the provided columns in the specified row groups.
///
/// If row_groups or column_indices are empty, then the result of this will be empty.
///
/// hole_size_limit represents the maximum distance, in bytes, between two
/// consecutive ranges; beyond this value, ranges will not be combined. The default
/// value is 1MB.
///
/// range_size_limit is the maximum size in bytes of a combined range; if combining
/// two consecutive ranges would produce a range larger than this, they are not
/// combined. The default value is 64MB. This *must* be larger than hole_size_limit.
///
/// This will not take into account page indexes or any other predicate push down
/// benefits that may be available.
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
/// Wait for the specified row groups and column indices to be pre-buffered.
///
/// After the returned Future completes, reading the specified row
/// groups/columns will not block.
///
/// PreBuffer must be called first. This method does not throw.
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
const std::vector<int>& column_indices) const;
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
};
/// \brief Read only the Parquet file metadata.
/// \param[in] source the Arrow file object to read the metadata from
/// \return the file metadata, as a shared FileMetaData instance
std::shared_ptr<FileMetaData> PARQUET_EXPORT
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
/// \brief Scan all values in the file. Useful for performance testing.
/// \param[in] columns the column numbers to scan. If empty, all columns are scanned
/// \param[in] column_batch_size number of values to read at a time when scanning
/// a column
/// \param[in] reader a ParquetFileReader instance (non-owning pointer)
/// \return number of semantic rows in the file
PARQUET_EXPORT
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
ParquetFileReader* reader);
} // namespace parquet
|