|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "parquet/encryption/type_fwd.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
|
|
|
namespace parquet { |
|
|
|
/// Alias that lets the rest of this header refer to Arrow's key/value metadata
/// container without spelling out the `::arrow` qualification.
using KeyValueMetadata = ::arrow::KeyValueMetadata;
|
|
|
class PARQUET_EXPORT ApplicationVersion { |
|
public: |
|
|
|
static const ApplicationVersion& PARQUET_251_FIXED_VERSION(); |
|
static const ApplicationVersion& PARQUET_816_FIXED_VERSION(); |
|
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION(); |
|
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION(); |
|
static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION(); |
|
|
|
|
|
std::string application_; |
|
|
|
std::string build_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
struct { |
|
int major; |
|
int minor; |
|
int patch; |
|
std::string unknown; |
|
std::string pre_release; |
|
std::string build_info; |
|
} version; |
|
|
|
ApplicationVersion() = default; |
|
explicit ApplicationVersion(const std::string& created_by); |
|
ApplicationVersion(std::string application, int major, int minor, int patch); |
|
|
|
|
|
bool VersionLt(const ApplicationVersion& other_version) const; |
|
|
|
|
|
bool VersionEq(const ApplicationVersion& other_version) const; |
|
|
|
|
|
bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics, |
|
SortOrder::type sort_order = SortOrder::SIGNED) const; |
|
}; |
|
|
|
class PARQUET_EXPORT ColumnCryptoMetaData { |
|
public: |
|
static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata); |
|
~ColumnCryptoMetaData(); |
|
|
|
bool Equals(const ColumnCryptoMetaData& other) const; |
|
|
|
std::shared_ptr<schema::ColumnPath> path_in_schema() const; |
|
bool encrypted_with_footer_key() const; |
|
const std::string& key_metadata() const; |
|
|
|
private: |
|
explicit ColumnCryptoMetaData(const uint8_t* metadata); |
|
|
|
class ColumnCryptoMetaDataImpl; |
|
std::unique_ptr<ColumnCryptoMetaDataImpl> impl_; |
|
}; |
|
|
|
|
|
struct PageEncodingStats { |
|
PageType::type page_type; |
|
Encoding::type encoding; |
|
int32_t count; |
|
}; |
|
|
|
|
|
/// \brief An (offset, length) pair locating an index.
///
/// Both members default to zero so that a default-initialized IndexLocation
/// never holds indeterminate values. The struct remains an aggregate under
/// C++14 and later, so `IndexLocation{offset, length}` still works.
/// NOTE(review): the units (presumably bytes within the file) are not stated
/// anywhere in this header -- confirm against the reader/writer code.
struct IndexLocation {
  /// Offset of the index.
  int64_t offset = 0;

  /// Length of the index.
  int32_t length = 0;
};
|
|
|
|
|
class PARQUET_EXPORT ColumnChunkMetaData { |
|
public: |
|
|
|
static std::unique_ptr<ColumnChunkMetaData> Make( |
|
const void* metadata, const ColumnDescriptor* descr, |
|
const ReaderProperties& properties = default_reader_properties(), |
|
const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1, |
|
int16_t column_ordinal = -1, |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
~ColumnChunkMetaData(); |
|
|
|
bool Equals(const ColumnChunkMetaData& other) const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int64_t file_offset() const; |
|
|
|
|
|
const std::string& file_path() const; |
|
|
|
|
|
bool is_metadata_set() const; |
|
Type::type type() const; |
|
int64_t num_values() const; |
|
std::shared_ptr<schema::ColumnPath> path_in_schema() const; |
|
bool is_stats_set() const; |
|
std::shared_ptr<Statistics> statistics() const; |
|
std::shared_ptr<SizeStatistics> size_statistics() const; |
|
|
|
Compression::type compression() const; |
|
|
|
|
|
bool can_decompress() const; |
|
|
|
const std::vector<Encoding::type>& encodings() const; |
|
const std::vector<PageEncodingStats>& encoding_stats() const; |
|
std::optional<int64_t> bloom_filter_offset() const; |
|
std::optional<int64_t> bloom_filter_length() const; |
|
bool has_dictionary_page() const; |
|
int64_t dictionary_page_offset() const; |
|
int64_t data_page_offset() const; |
|
bool has_index_page() const; |
|
int64_t index_page_offset() const; |
|
int64_t total_compressed_size() const; |
|
int64_t total_uncompressed_size() const; |
|
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const; |
|
std::optional<IndexLocation> GetColumnIndexLocation() const; |
|
std::optional<IndexLocation> GetOffsetIndexLocation() const; |
|
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; |
|
|
|
private: |
|
explicit ColumnChunkMetaData( |
|
const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal, |
|
int16_t column_ordinal, const ReaderProperties& properties, |
|
const ApplicationVersion* writer_version = NULLPTR, |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
class ColumnChunkMetaDataImpl; |
|
std::unique_ptr<ColumnChunkMetaDataImpl> impl_; |
|
}; |
|
|
|
|
|
class PARQUET_EXPORT RowGroupMetaData { |
|
public: |
|
|
|
static std::unique_ptr<RowGroupMetaData> Make( |
|
const void* metadata, const SchemaDescriptor* schema, |
|
const ReaderProperties& properties = default_reader_properties(), |
|
const ApplicationVersion* writer_version = NULLPTR, |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
~RowGroupMetaData(); |
|
|
|
bool Equals(const RowGroupMetaData& other) const; |
|
|
|
|
|
|
|
int num_columns() const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const; |
|
|
|
|
|
int64_t num_rows() const; |
|
|
|
|
|
int64_t total_byte_size() const; |
|
|
|
|
|
|
|
|
|
|
|
int64_t total_compressed_size() const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
int64_t file_offset() const; |
|
|
|
const SchemaDescriptor* schema() const; |
|
|
|
bool can_decompress() const; |
|
|
|
std::vector<SortingColumn> sorting_columns() const; |
|
|
|
private: |
|
explicit RowGroupMetaData( |
|
const void* metadata, const SchemaDescriptor* schema, |
|
const ReaderProperties& properties, |
|
const ApplicationVersion* writer_version = NULLPTR, |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
class RowGroupMetaDataImpl; |
|
std::unique_ptr<RowGroupMetaDataImpl> impl_; |
|
}; |
|
|
|
// Forward declaration: FileMetaData and FileCryptoMetaData name the builder as
// a friend before its full declaration appears later in this header.
class FileMetaDataBuilder;
|
|
|
|
|
class PARQUET_EXPORT FileMetaData { |
|
public: |
|
|
|
static std::shared_ptr<FileMetaData> Make( |
|
const void* serialized_metadata, uint32_t* inout_metadata_len, |
|
const ReaderProperties& properties = default_reader_properties(), |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
~FileMetaData(); |
|
|
|
bool Equals(const FileMetaData& other) const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int num_columns() const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
int num_schema_elements() const; |
|
|
|
|
|
|
|
|
|
|
|
int64_t num_rows() const; |
|
|
|
|
|
|
|
|
|
|
|
int num_row_groups() const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::unique_ptr<RowGroupMetaData> RowGroup(int index) const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ParquetVersion::type version() const; |
|
|
|
|
|
const std::string& created_by() const; |
|
|
|
|
|
const ApplicationVersion& writer_version() const; |
|
|
|
|
|
uint32_t size() const; |
|
|
|
|
|
|
|
|
|
|
|
bool can_decompress() const; |
|
|
|
bool is_encryption_algorithm_set() const; |
|
EncryptionAlgorithm encryption_algorithm() const; |
|
const std::string& footer_signing_key_metadata() const; |
|
|
|
|
|
|
|
bool VerifySignature(const void* signature); |
|
|
|
void WriteTo(::arrow::io::OutputStream* dst, |
|
const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const; |
|
|
|
|
|
|
|
std::string SerializeToString() const; |
|
|
|
|
|
const SchemaDescriptor* schema() const; |
|
|
|
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void set_file_path(const std::string& path); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void AppendRowGroups(const FileMetaData& other); |
|
|
|
|
|
|
|
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const; |
|
|
|
|
|
|
|
|
|
|
|
|
|
std::string SerializeUnencrypted(bool scrub, bool debug) const; |
|
|
|
private: |
|
friend FileMetaDataBuilder; |
|
friend class SerializedFile; |
|
friend class SerializedRowGroup; |
|
|
|
explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, |
|
const ReaderProperties& properties, |
|
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR); |
|
|
|
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor); |
|
const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const; |
|
|
|
|
|
FileMetaData(); |
|
class FileMetaDataImpl; |
|
std::unique_ptr<FileMetaDataImpl> impl_; |
|
}; |
|
|
|
class PARQUET_EXPORT FileCryptoMetaData { |
|
public: |
|
|
|
static std::shared_ptr<FileCryptoMetaData> Make( |
|
const uint8_t* serialized_metadata, uint32_t* metadata_len, |
|
const ReaderProperties& properties = default_reader_properties()); |
|
~FileCryptoMetaData(); |
|
|
|
EncryptionAlgorithm encryption_algorithm() const; |
|
const std::string& key_metadata() const; |
|
|
|
void WriteTo(::arrow::io::OutputStream* dst) const; |
|
|
|
private: |
|
friend FileMetaDataBuilder; |
|
FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len, |
|
const ReaderProperties& properties); |
|
|
|
|
|
FileCryptoMetaData(); |
|
class FileCryptoMetaDataImpl; |
|
std::unique_ptr<FileCryptoMetaDataImpl> impl_; |
|
}; |
|
|
|
|
|
class PARQUET_EXPORT ColumnChunkMetaDataBuilder { |
|
public: |
|
|
|
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
|
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column); |
|
|
|
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make( |
|
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column, |
|
void* contents); |
|
|
|
~ColumnChunkMetaDataBuilder(); |
|
|
|
|
|
|
|
void set_file_path(const std::string& path); |
|
|
|
|
|
void SetStatistics(const EncodedStatistics& stats); |
|
void SetSizeStatistics(const SizeStatistics& size_stats); |
|
|
|
void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata> key_value_metadata); |
|
|
|
|
|
const ColumnDescriptor* descr() const; |
|
|
|
int64_t total_compressed_size() const; |
|
|
|
|
|
void Finish(int64_t num_values, int64_t dictionary_page_offset, |
|
int64_t index_page_offset, int64_t data_page_offset, |
|
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary, |
|
bool dictionary_fallback, |
|
const std::map<Encoding::type, int32_t>& dict_encoding_stats_, |
|
const std::map<Encoding::type, int32_t>& data_encoding_stats_, |
|
const std::shared_ptr<Encryptor>& encryptor = NULLPTR); |
|
|
|
|
|
const void* contents() const; |
|
|
|
|
|
void WriteTo(::arrow::io::OutputStream* sink); |
|
|
|
private: |
|
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props, |
|
const ColumnDescriptor* column); |
|
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props, |
|
const ColumnDescriptor* column, void* contents); |
|
|
|
class ColumnChunkMetaDataBuilderImpl; |
|
std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_; |
|
}; |
|
|
|
class PARQUET_EXPORT RowGroupMetaDataBuilder { |
|
public: |
|
|
|
static std::unique_ptr<RowGroupMetaDataBuilder> Make( |
|
std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_, |
|
void* contents); |
|
|
|
~RowGroupMetaDataBuilder(); |
|
|
|
ColumnChunkMetaDataBuilder* NextColumnChunk(); |
|
int num_columns(); |
|
int64_t num_rows(); |
|
int current_column() const; |
|
|
|
void set_num_rows(int64_t num_rows); |
|
|
|
|
|
void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1); |
|
|
|
private: |
|
explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props, |
|
const SchemaDescriptor* schema_, void* contents); |
|
|
|
class RowGroupMetaDataBuilderImpl; |
|
std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_; |
|
}; |
|
|
|
|
|
struct PageIndexLocation { |
|
|
|
|
|
|
|
using RowGroupIndexLocation = std::vector<std::optional<IndexLocation>>; |
|
|
|
|
|
using FileIndexLocation = std::map<size_t, RowGroupIndexLocation>; |
|
|
|
FileIndexLocation column_index_location; |
|
|
|
FileIndexLocation offset_index_location; |
|
}; |
|
|
|
class PARQUET_EXPORT FileMetaDataBuilder { |
|
public: |
|
|
|
static std::unique_ptr<FileMetaDataBuilder> Make( |
|
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props); |
|
|
|
~FileMetaDataBuilder(); |
|
|
|
|
|
RowGroupMetaDataBuilder* AppendRowGroup(); |
|
|
|
|
|
void SetPageIndexLocation(const PageIndexLocation& location); |
|
|
|
|
|
std::unique_ptr<FileMetaData> Finish( |
|
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
|
|
|
|
|
std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData(); |
|
|
|
private: |
|
explicit FileMetaDataBuilder(const SchemaDescriptor* schema, |
|
std::shared_ptr<WriterProperties> props); |
|
|
|
class FileMetaDataBuilderImpl; |
|
std::unique_ptr<FileMetaDataBuilderImpl> impl_; |
|
}; |
|
|
|
/// Return a string representation of the given Parquet version.
/// NOTE(review): the exact strings produced are defined by the implementation.
PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
|
|
|
} |
|
|