|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <memory> |
|
#include <string> |
|
#include <unordered_map> |
|
#include <unordered_set> |
|
#include <utility> |
|
|
|
#include "arrow/io/caching.h" |
|
#include "arrow/type.h" |
|
#include "arrow/util/compression.h" |
|
#include "arrow/util/type_fwd.h" |
|
#include "parquet/encryption/encryption.h" |
|
#include "parquet/exception.h" |
|
#include "parquet/parquet_version.h" |
|
#include "parquet/platform.h" |
|
#include "parquet/schema.h" |
|
#include "parquet/type_fwd.h" |
|
#include "parquet/types.h" |
|
|
|
namespace parquet { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
enum class ParquetDataPageVersion { V1, V2 }; |
|
|
|
|
|
enum class SizeStatisticsLevel : uint8_t { |
|
|
|
None = 0, |
|
|
|
ColumnChunk, |
|
|
|
PageAndColumnChunk |
|
}; |
|
|
|
|
|
constexpr int64_t kDefaultBufferSize = 4096 * 4; |
|
|
|
constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000; |
|
|
|
|
|
|
|
constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000; |
|
|
|
|
|
constexpr int64_t kDefaultFooterReadSize = 64 * 1024; |
|
|
|
class PARQUET_EXPORT ReaderProperties { |
|
public: |
|
explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) |
|
: pool_(pool) {} |
|
|
|
MemoryPool* memory_pool() const { return pool_; } |
|
|
|
std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source, |
|
int64_t start, int64_t num_bytes); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; } |
|
|
|
void enable_buffered_stream() { buffered_stream_enabled_ = true; } |
|
|
|
void disable_buffered_stream() { buffered_stream_enabled_ = false; } |
|
|
|
bool read_dense_for_nullable() const { return read_dense_for_nullable_; } |
|
void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; } |
|
void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; } |
|
|
|
|
|
int64_t buffer_size() const { return buffer_size_; } |
|
|
|
void set_buffer_size(int64_t size) { buffer_size_ = size; } |
|
|
|
|
|
|
|
|
|
|
|
int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; } |
|
|
|
void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; } |
|
|
|
|
|
|
|
|
|
|
|
int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; } |
|
|
|
void set_thrift_container_size_limit(int32_t size) { |
|
thrift_container_size_limit_ = size; |
|
} |
|
|
|
|
|
void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) { |
|
file_decryption_properties_ = std::move(decryption); |
|
} |
|
|
|
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const { |
|
return file_decryption_properties_; |
|
} |
|
|
|
bool page_checksum_verification() const { return page_checksum_verification_; } |
|
void set_page_checksum_verification(bool check_crc) { |
|
page_checksum_verification_ = check_crc; |
|
} |
|
|
|
|
|
|
|
|
|
void set_footer_read_size(size_t size) { footer_read_size_ = size; } |
|
size_t footer_read_size() const { return footer_read_size_; } |
|
|
|
private: |
|
MemoryPool* pool_; |
|
int64_t buffer_size_ = kDefaultBufferSize; |
|
int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit; |
|
int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit; |
|
bool buffered_stream_enabled_ = false; |
|
bool page_checksum_verification_ = false; |
|
|
|
bool read_dense_for_nullable_ = false; |
|
size_t footer_read_size_ = kDefaultFooterReadSize; |
|
std::shared_ptr<FileDecryptionProperties> file_decryption_properties_; |
|
}; |
|
|
|
ReaderProperties PARQUET_EXPORT default_reader_properties(); |
|
|
|
static constexpr int64_t kDefaultDataPageSize = 1024 * 1024; |
|
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true; |
|
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize; |
|
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; |
|
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024; |
|
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; |
|
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; |
|
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN; |
|
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; |
|
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; |
|
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; |
|
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = |
|
SizeStatisticsLevel::PageAndColumnChunk; |
|
|
|
class PARQUET_EXPORT ColumnProperties { |
|
public: |
|
ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, |
|
Compression::type codec = DEFAULT_COMPRESSION_TYPE, |
|
bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, |
|
bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, |
|
size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, |
|
bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED) |
|
: encoding_(encoding), |
|
codec_(codec), |
|
dictionary_enabled_(dictionary_enabled), |
|
statistics_enabled_(statistics_enabled), |
|
max_stats_size_(max_stats_size), |
|
page_index_enabled_(page_index_enabled) {} |
|
|
|
void set_encoding(Encoding::type encoding) { encoding_ = encoding; } |
|
|
|
void set_compression(Compression::type codec) { codec_ = codec; } |
|
|
|
void set_dictionary_enabled(bool dictionary_enabled) { |
|
dictionary_enabled_ = dictionary_enabled; |
|
} |
|
|
|
void set_statistics_enabled(bool statistics_enabled) { |
|
statistics_enabled_ = statistics_enabled; |
|
} |
|
|
|
void set_max_statistics_size(size_t max_stats_size) { |
|
max_stats_size_ = max_stats_size; |
|
} |
|
|
|
void set_compression_level(int compression_level) { |
|
if (!codec_options_) { |
|
codec_options_ = std::make_shared<CodecOptions>(); |
|
} |
|
codec_options_->compression_level = compression_level; |
|
} |
|
|
|
void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) { |
|
codec_options_ = codec_options; |
|
} |
|
|
|
void set_page_index_enabled(bool page_index_enabled) { |
|
page_index_enabled_ = page_index_enabled; |
|
} |
|
|
|
Encoding::type encoding() const { return encoding_; } |
|
|
|
Compression::type compression() const { return codec_; } |
|
|
|
bool dictionary_enabled() const { return dictionary_enabled_; } |
|
|
|
bool statistics_enabled() const { return statistics_enabled_; } |
|
|
|
size_t max_statistics_size() const { return max_stats_size_; } |
|
|
|
int compression_level() const { |
|
if (!codec_options_) { |
|
return ::arrow::util::kUseDefaultCompressionLevel; |
|
} |
|
return codec_options_->compression_level; |
|
} |
|
|
|
const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; } |
|
|
|
bool page_index_enabled() const { return page_index_enabled_; } |
|
|
|
private: |
|
Encoding::type encoding_; |
|
Compression::type codec_; |
|
bool dictionary_enabled_; |
|
bool statistics_enabled_; |
|
size_t max_stats_size_; |
|
std::shared_ptr<CodecOptions> codec_options_; |
|
bool page_index_enabled_; |
|
}; |
|
|
|
class PARQUET_EXPORT WriterProperties { |
|
public: |
|
class Builder { |
|
public: |
|
Builder() |
|
: pool_(::arrow::default_memory_pool()), |
|
dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT), |
|
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), |
|
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), |
|
pagesize_(kDefaultDataPageSize), |
|
version_(ParquetVersion::PARQUET_2_6), |
|
data_page_version_(ParquetDataPageVersion::V1), |
|
created_by_(DEFAULT_CREATED_BY), |
|
store_decimal_as_integer_(false), |
|
page_checksum_enabled_(false), |
|
size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {} |
|
|
|
explicit Builder(const WriterProperties& properties) |
|
: pool_(properties.memory_pool()), |
|
dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()), |
|
write_batch_size_(properties.write_batch_size()), |
|
max_row_group_length_(properties.max_row_group_length()), |
|
pagesize_(properties.data_pagesize()), |
|
version_(properties.version()), |
|
data_page_version_(properties.data_page_version()), |
|
created_by_(properties.created_by()), |
|
store_decimal_as_integer_(properties.store_decimal_as_integer()), |
|
page_checksum_enabled_(properties.page_checksum_enabled()), |
|
size_statistics_level_(properties.size_statistics_level()), |
|
sorting_columns_(properties.sorting_columns()), |
|
default_column_properties_(properties.default_column_properties()) {} |
|
|
|
virtual ~Builder() {} |
|
|
|
|
|
Builder* memory_pool(MemoryPool* pool) { |
|
pool_ = pool; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_dictionary() { |
|
default_column_properties_.set_dictionary_enabled(true); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* disable_dictionary() { |
|
default_column_properties_.set_dictionary_enabled(false); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_dictionary(const std::string& path) { |
|
dictionary_enabled_[path] = true; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->enable_dictionary(path->ToDotString()); |
|
} |
|
|
|
|
|
|
|
Builder* disable_dictionary(const std::string& path) { |
|
dictionary_enabled_[path] = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->disable_dictionary(path->ToDotString()); |
|
} |
|
|
|
|
|
Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) { |
|
dictionary_pagesize_limit_ = dictionary_psize_limit; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* write_batch_size(int64_t write_batch_size) { |
|
write_batch_size_ = write_batch_size; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* max_row_group_length(int64_t max_row_group_length) { |
|
max_row_group_length_ = max_row_group_length; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* data_pagesize(int64_t pg_size) { |
|
pagesize_ = pg_size; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* data_page_version(ParquetDataPageVersion data_page_version) { |
|
data_page_version_ = data_page_version; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* version(ParquetVersion::type version) { |
|
version_ = version; |
|
return this; |
|
} |
|
|
|
Builder* created_by(const std::string& created_by) { |
|
created_by_ = created_by; |
|
return this; |
|
} |
|
|
|
Builder* enable_page_checksum() { |
|
page_checksum_enabled_ = true; |
|
return this; |
|
} |
|
|
|
Builder* disable_page_checksum() { |
|
page_checksum_enabled_ = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* encoding(Encoding::type encoding_type) { |
|
if (encoding_type == Encoding::PLAIN_DICTIONARY || |
|
encoding_type == Encoding::RLE_DICTIONARY) { |
|
throw ParquetException("Can't use dictionary encoding as fallback encoding"); |
|
} |
|
|
|
default_column_properties_.set_encoding(encoding_type); |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* encoding(const std::string& path, Encoding::type encoding_type) { |
|
if (encoding_type == Encoding::PLAIN_DICTIONARY || |
|
encoding_type == Encoding::RLE_DICTIONARY) { |
|
throw ParquetException("Can't use dictionary encoding as fallback encoding"); |
|
} |
|
|
|
encodings_[path] = encoding_type; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path, |
|
Encoding::type encoding_type) { |
|
return this->encoding(path->ToDotString(), encoding_type); |
|
} |
|
|
|
|
|
|
|
Builder* compression(Compression::type codec) { |
|
default_column_properties_.set_compression(codec); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* max_statistics_size(size_t max_stats_sz) { |
|
default_column_properties_.set_max_statistics_size(max_stats_sz); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* compression(const std::string& path, Compression::type codec) { |
|
codecs_[path] = codec; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* compression(const std::shared_ptr<schema::ColumnPath>& path, |
|
Compression::type codec) { |
|
return this->compression(path->ToDotString(), codec); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* compression_level(int compression_level) { |
|
default_column_properties_.set_compression_level(compression_level); |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* compression_level(const std::string& path, int compression_level) { |
|
if (!codec_options_[path]) { |
|
codec_options_[path] = std::make_shared<CodecOptions>(); |
|
} |
|
codec_options_[path]->compression_level = compression_level; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path, |
|
int compression_level) { |
|
return this->compression_level(path->ToDotString(), compression_level); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* codec_options( |
|
const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
|
default_column_properties_.set_codec_options(codec_options); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* codec_options( |
|
const std::string& path, |
|
const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
|
codec_options_[path] = codec_options; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* codec_options( |
|
const std::shared_ptr<schema::ColumnPath>& path, |
|
const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
|
return this->codec_options(path->ToDotString(), codec_options); |
|
} |
|
|
|
|
|
|
|
Builder* encryption( |
|
std::shared_ptr<FileEncryptionProperties> file_encryption_properties) { |
|
file_encryption_properties_ = std::move(file_encryption_properties); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_statistics() { |
|
default_column_properties_.set_statistics_enabled(true); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* disable_statistics() { |
|
default_column_properties_.set_statistics_enabled(false); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_statistics(const std::string& path) { |
|
statistics_enabled_[path] = true; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->enable_statistics(path->ToDotString()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) { |
|
sorting_columns_ = std::move(sorting_columns); |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* disable_statistics(const std::string& path) { |
|
statistics_enabled_[path] = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->disable_statistics(path->ToDotString()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* enable_store_decimal_as_integer() { |
|
store_decimal_as_integer_ = true; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* disable_store_decimal_as_integer() { |
|
store_decimal_as_integer_ = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* enable_write_page_index() { |
|
default_column_properties_.set_page_index_enabled(true); |
|
return this; |
|
} |
|
|
|
|
|
Builder* disable_write_page_index() { |
|
default_column_properties_.set_page_index_enabled(false); |
|
return this; |
|
} |
|
|
|
|
|
Builder* enable_write_page_index(const std::string& path) { |
|
page_index_enabled_[path] = true; |
|
return this; |
|
} |
|
|
|
|
|
Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->enable_write_page_index(path->ToDotString()); |
|
} |
|
|
|
|
|
Builder* disable_write_page_index(const std::string& path) { |
|
page_index_enabled_[path] = false; |
|
return this; |
|
} |
|
|
|
|
|
Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) { |
|
return this->disable_write_page_index(path->ToDotString()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* set_size_statistics_level(SizeStatisticsLevel level) { |
|
size_statistics_level_ = level; |
|
return this; |
|
} |
|
|
|
|
|
|
|
std::shared_ptr<WriterProperties> build() { |
|
std::unordered_map<std::string, ColumnProperties> column_properties; |
|
auto get = [&](const std::string& key) -> ColumnProperties& { |
|
auto it = column_properties.find(key); |
|
if (it == column_properties.end()) |
|
return column_properties[key] = default_column_properties_; |
|
else |
|
return it->second; |
|
}; |
|
|
|
for (const auto& item : encodings_) get(item.first).set_encoding(item.second); |
|
for (const auto& item : codecs_) get(item.first).set_compression(item.second); |
|
for (const auto& item : codec_options_) |
|
get(item.first).set_codec_options(item.second); |
|
for (const auto& item : dictionary_enabled_) |
|
get(item.first).set_dictionary_enabled(item.second); |
|
for (const auto& item : statistics_enabled_) |
|
get(item.first).set_statistics_enabled(item.second); |
|
for (const auto& item : page_index_enabled_) |
|
get(item.first).set_page_index_enabled(item.second); |
|
|
|
return std::shared_ptr<WriterProperties>(new WriterProperties( |
|
pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, |
|
pagesize_, version_, created_by_, page_checksum_enabled_, |
|
size_statistics_level_, std::move(file_encryption_properties_), |
|
default_column_properties_, column_properties, data_page_version_, |
|
store_decimal_as_integer_, std::move(sorting_columns_))); |
|
} |
|
|
|
private: |
|
MemoryPool* pool_; |
|
int64_t dictionary_pagesize_limit_; |
|
int64_t write_batch_size_; |
|
int64_t max_row_group_length_; |
|
int64_t pagesize_; |
|
ParquetVersion::type version_; |
|
ParquetDataPageVersion data_page_version_; |
|
std::string created_by_; |
|
bool store_decimal_as_integer_; |
|
bool page_checksum_enabled_; |
|
SizeStatisticsLevel size_statistics_level_; |
|
|
|
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_; |
|
|
|
|
|
std::vector<SortingColumn> sorting_columns_; |
|
|
|
|
|
ColumnProperties default_column_properties_; |
|
std::unordered_map<std::string, Encoding::type> encodings_; |
|
std::unordered_map<std::string, Compression::type> codecs_; |
|
std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_; |
|
std::unordered_map<std::string, bool> dictionary_enabled_; |
|
std::unordered_map<std::string, bool> statistics_enabled_; |
|
std::unordered_map<std::string, bool> page_index_enabled_; |
|
}; |
|
|
|
inline MemoryPool* memory_pool() const { return pool_; } |
|
|
|
inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; } |
|
|
|
inline int64_t write_batch_size() const { return write_batch_size_; } |
|
|
|
inline int64_t max_row_group_length() const { return max_row_group_length_; } |
|
|
|
inline int64_t data_pagesize() const { return pagesize_; } |
|
|
|
inline ParquetDataPageVersion data_page_version() const { |
|
return parquet_data_page_version_; |
|
} |
|
|
|
inline ParquetVersion::type version() const { return parquet_version_; } |
|
|
|
inline std::string created_by() const { return parquet_created_by_; } |
|
|
|
inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; } |
|
|
|
inline bool page_checksum_enabled() const { return page_checksum_enabled_; } |
|
|
|
inline SizeStatisticsLevel size_statistics_level() const { |
|
return size_statistics_level_; |
|
} |
|
|
|
inline Encoding::type dictionary_index_encoding() const { |
|
if (parquet_version_ == ParquetVersion::PARQUET_1_0) { |
|
return Encoding::PLAIN_DICTIONARY; |
|
} else { |
|
return Encoding::RLE_DICTIONARY; |
|
} |
|
} |
|
|
|
inline Encoding::type dictionary_page_encoding() const { |
|
if (parquet_version_ == ParquetVersion::PARQUET_1_0) { |
|
return Encoding::PLAIN_DICTIONARY; |
|
} else { |
|
return Encoding::PLAIN; |
|
} |
|
} |
|
|
|
const ColumnProperties& column_properties( |
|
const std::shared_ptr<schema::ColumnPath>& path) const { |
|
auto it = column_properties_.find(path->ToDotString()); |
|
if (it != column_properties_.end()) return it->second; |
|
return default_column_properties_; |
|
} |
|
|
|
Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).encoding(); |
|
} |
|
|
|
Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).compression(); |
|
} |
|
|
|
int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).compression_level(); |
|
} |
|
|
|
const std::shared_ptr<CodecOptions> codec_options( |
|
const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).codec_options(); |
|
} |
|
|
|
bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).dictionary_enabled(); |
|
} |
|
|
|
const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; } |
|
|
|
bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).statistics_enabled(); |
|
} |
|
|
|
size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).max_statistics_size(); |
|
} |
|
|
|
bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
|
return column_properties(path).page_index_enabled(); |
|
} |
|
|
|
bool page_index_enabled() const { |
|
if (default_column_properties_.page_index_enabled()) { |
|
return true; |
|
} |
|
for (const auto& item : column_properties_) { |
|
if (item.second.page_index_enabled()) { |
|
return true; |
|
} |
|
} |
|
return false; |
|
} |
|
|
|
inline FileEncryptionProperties* file_encryption_properties() const { |
|
return file_encryption_properties_.get(); |
|
} |
|
|
|
std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties( |
|
const std::string& path) const { |
|
if (file_encryption_properties_) { |
|
return file_encryption_properties_->column_encryption_properties(path); |
|
} else { |
|
return NULLPTR; |
|
} |
|
} |
|
|
|
|
|
const ColumnProperties& default_column_properties() const { |
|
return default_column_properties_; |
|
} |
|
|
|
private: |
|
explicit WriterProperties( |
|
MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, |
|
int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version, |
|
const std::string& created_by, bool page_write_checksum_enabled, |
|
SizeStatisticsLevel size_statistics_level, |
|
std::shared_ptr<FileEncryptionProperties> file_encryption_properties, |
|
const ColumnProperties& default_column_properties, |
|
const std::unordered_map<std::string, ColumnProperties>& column_properties, |
|
ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, |
|
std::vector<SortingColumn> sorting_columns) |
|
: pool_(pool), |
|
dictionary_pagesize_limit_(dictionary_pagesize_limit), |
|
write_batch_size_(write_batch_size), |
|
max_row_group_length_(max_row_group_length), |
|
pagesize_(pagesize), |
|
parquet_data_page_version_(data_page_version), |
|
parquet_version_(version), |
|
parquet_created_by_(created_by), |
|
store_decimal_as_integer_(store_short_decimal_as_integer), |
|
page_checksum_enabled_(page_write_checksum_enabled), |
|
size_statistics_level_(size_statistics_level), |
|
file_encryption_properties_(file_encryption_properties), |
|
sorting_columns_(std::move(sorting_columns)), |
|
default_column_properties_(default_column_properties), |
|
column_properties_(column_properties) {} |
|
|
|
MemoryPool* pool_; |
|
int64_t dictionary_pagesize_limit_; |
|
int64_t write_batch_size_; |
|
int64_t max_row_group_length_; |
|
int64_t pagesize_; |
|
ParquetDataPageVersion parquet_data_page_version_; |
|
ParquetVersion::type parquet_version_; |
|
std::string parquet_created_by_; |
|
bool store_decimal_as_integer_; |
|
bool page_checksum_enabled_; |
|
SizeStatisticsLevel size_statistics_level_; |
|
|
|
std::shared_ptr<FileEncryptionProperties> file_encryption_properties_; |
|
|
|
std::vector<SortingColumn> sorting_columns_; |
|
|
|
ColumnProperties default_column_properties_; |
|
std::unordered_map<std::string, ColumnProperties> column_properties_; |
|
}; |
|
|
|
PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties(); |
|
|
|
|
|
|
|
|
|
static constexpr bool kArrowDefaultUseThreads = false; |
|
|
|
|
|
static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024; |
|
|
|
|
|
class PARQUET_EXPORT ArrowReaderProperties { |
|
public: |
|
explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads) |
|
: use_threads_(use_threads), |
|
read_dict_indices_(), |
|
batch_size_(kArrowDefaultBatchSize), |
|
pre_buffer_(true), |
|
cache_options_(::arrow::io::CacheOptions::LazyDefaults()), |
|
coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), |
|
arrow_extensions_enabled_(false), |
|
should_load_statistics_(false) {} |
|
|
|
|
|
|
|
|
|
void set_use_threads(bool use_threads) { use_threads_ = use_threads; } |
|
|
|
bool use_threads() const { return use_threads_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void set_read_dictionary(int column_index, bool read_dict) { |
|
if (read_dict) { |
|
read_dict_indices_.insert(column_index); |
|
} else { |
|
read_dict_indices_.erase(column_index); |
|
} |
|
} |
|
|
|
bool read_dictionary(int column_index) const { |
|
if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { |
|
return true; |
|
} else { |
|
return false; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; } |
|
|
|
|
|
|
|
int64_t batch_size() const { return batch_size_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; } |
|
|
|
bool pre_buffer() const { return pre_buffer_; } |
|
|
|
|
|
|
|
void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; } |
|
|
|
const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; } |
|
|
|
|
|
void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; } |
|
|
|
const ::arrow::io::IOContext& io_context() const { return io_context_; } |
|
|
|
|
|
|
|
void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) { |
|
coerce_int96_timestamp_unit_ = unit; |
|
} |
|
|
|
::arrow::TimeUnit::type coerce_int96_timestamp_unit() const { |
|
return coerce_int96_timestamp_unit_; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void set_arrow_extensions_enabled(bool extensions_enabled) { |
|
arrow_extensions_enabled_ = extensions_enabled; |
|
} |
|
bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; } |
|
|
|
|
|
|
|
|
|
void set_should_load_statistics(bool should_load_statistics) { |
|
should_load_statistics_ = should_load_statistics; |
|
} |
|
|
|
bool should_load_statistics() const { return should_load_statistics_; } |
|
|
|
private: |
|
bool use_threads_; |
|
std::unordered_set<int> read_dict_indices_; |
|
int64_t batch_size_; |
|
bool pre_buffer_; |
|
::arrow::io::IOContext io_context_; |
|
::arrow::io::CacheOptions cache_options_; |
|
::arrow::TimeUnit::type coerce_int96_timestamp_unit_; |
|
bool arrow_extensions_enabled_; |
|
bool should_load_statistics_; |
|
}; |
|
|
|
|
|
PARQUET_EXPORT |
|
ArrowReaderProperties default_arrow_reader_properties(); |
|
|
|
class PARQUET_EXPORT ArrowWriterProperties { |
|
public: |
|
enum EngineVersion { |
|
V1, |
|
V2 |
|
}; |
|
class Builder { |
|
public: |
|
Builder() |
|
: write_timestamps_as_int96_(false), |
|
coerce_timestamps_enabled_(false), |
|
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), |
|
truncated_timestamps_allowed_(false), |
|
store_schema_(false), |
|
compliant_nested_types_(true), |
|
engine_version_(V2), |
|
use_threads_(kArrowDefaultUseThreads), |
|
executor_(NULLPTR) {} |
|
virtual ~Builder() = default; |
|
|
|
|
|
Builder* disable_deprecated_int96_timestamps() { |
|
write_timestamps_as_int96_ = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* enable_deprecated_int96_timestamps() { |
|
write_timestamps_as_int96_ = true; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
Builder* coerce_timestamps(::arrow::TimeUnit::type unit) { |
|
coerce_timestamps_enabled_ = true; |
|
coerce_timestamps_unit_ = unit; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
Builder* allow_truncated_timestamps() { |
|
truncated_timestamps_allowed_ = true; |
|
return this; |
|
} |
|
|
|
|
|
Builder* disallow_truncated_timestamps() { |
|
truncated_timestamps_allowed_ = false; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
Builder* store_schema() { |
|
store_schema_ = true; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* enable_compliant_nested_types() { |
|
compliant_nested_types_ = true; |
|
return this; |
|
} |
|
|
|
|
|
Builder* disable_compliant_nested_types() { |
|
compliant_nested_types_ = false; |
|
return this; |
|
} |
|
|
|
|
|
Builder* set_engine_version(EngineVersion version) { |
|
engine_version_ = version; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Builder* set_use_threads(bool use_threads) { |
|
use_threads_ = use_threads; |
|
return this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
Builder* set_executor(::arrow::internal::Executor* executor) { |
|
executor_ = executor; |
|
return this; |
|
} |
|
|
|
|
|
std::shared_ptr<ArrowWriterProperties> build() { |
|
return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties( |
|
write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, |
|
truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, |
|
engine_version_, use_threads_, executor_)); |
|
} |
|
|
|
private: |
|
bool write_timestamps_as_int96_; |
|
|
|
bool coerce_timestamps_enabled_; |
|
::arrow::TimeUnit::type coerce_timestamps_unit_; |
|
bool truncated_timestamps_allowed_; |
|
|
|
bool store_schema_; |
|
bool compliant_nested_types_; |
|
EngineVersion engine_version_; |
|
|
|
bool use_threads_; |
|
::arrow::internal::Executor* executor_; |
|
}; |
|
|
|
bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } |
|
|
|
bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } |
|
::arrow::TimeUnit::type coerce_timestamps_unit() const { |
|
return coerce_timestamps_unit_; |
|
} |
|
|
|
bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } |
|
|
|
bool store_schema() const { return store_schema_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
bool compliant_nested_types() const { return compliant_nested_types_; } |
|
|
|
|
|
|
|
|
|
|
|
EngineVersion engine_version() const { return engine_version_; } |
|
|
|
|
|
|
|
bool use_threads() const { return use_threads_; } |
|
|
|
|
|
::arrow::internal::Executor* executor() const; |
|
|
|
private: |
|
explicit ArrowWriterProperties(bool write_nanos_as_int96, |
|
bool coerce_timestamps_enabled, |
|
::arrow::TimeUnit::type coerce_timestamps_unit, |
|
bool truncated_timestamps_allowed, bool store_schema, |
|
bool compliant_nested_types, |
|
EngineVersion engine_version, bool use_threads, |
|
::arrow::internal::Executor* executor) |
|
: write_timestamps_as_int96_(write_nanos_as_int96), |
|
coerce_timestamps_enabled_(coerce_timestamps_enabled), |
|
coerce_timestamps_unit_(coerce_timestamps_unit), |
|
truncated_timestamps_allowed_(truncated_timestamps_allowed), |
|
store_schema_(store_schema), |
|
compliant_nested_types_(compliant_nested_types), |
|
engine_version_(engine_version), |
|
use_threads_(use_threads), |
|
executor_(executor) {} |
|
|
|
const bool write_timestamps_as_int96_; |
|
const bool coerce_timestamps_enabled_; |
|
const ::arrow::TimeUnit::type coerce_timestamps_unit_; |
|
const bool truncated_timestamps_allowed_; |
|
const bool store_schema_; |
|
const bool compliant_nested_types_; |
|
const EngineVersion engine_version_; |
|
const bool use_threads_; |
|
::arrow::internal::Executor* executor_; |
|
}; |
|
|
|
|
|
|
|
struct ArrowWriteContext { |
|
ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties) |
|
: memory_pool(memory_pool), |
|
properties(properties), |
|
data_buffer(AllocateBuffer(memory_pool)), |
|
def_levels_buffer(AllocateBuffer(memory_pool)) {} |
|
|
|
template <typename T> |
|
::arrow::Status GetScratchData(const int64_t num_values, T** out) { |
|
ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false)); |
|
*out = reinterpret_cast<T*>(this->data_buffer->mutable_data()); |
|
return ::arrow::Status::OK(); |
|
} |
|
|
|
MemoryPool* memory_pool; |
|
const ArrowWriterProperties* properties; |
|
|
|
|
|
|
|
std::shared_ptr<ResizableBuffer> data_buffer; |
|
|
|
|
|
std::shared_ptr<ResizableBuffer> def_levels_buffer; |
|
}; |
|
|
|
PARQUET_EXPORT |
|
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties(); |
|
|
|
} |
|
|