|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <cstdint> |
|
#include <cstring> |
|
#include <memory> |
|
#include <vector> |
|
|
|
#include "arrow/type_fwd.h" |
|
|
|
#include "parquet/exception.h" |
|
#include "parquet/platform.h" |
|
#include "parquet/types.h" |
|
|
|
// Forward declaration only: this header refers to Dictionary32Builder purely
// as a template name inside the EncodingTraits specializations.
namespace arrow {

template <class T>
class Dictionary32Builder;

}  // namespace arrow
|
|
|
namespace parquet { |
|
|
|
template <typename DType> |
|
class TypedEncoder; |
|
|
|
using BooleanEncoder = TypedEncoder<BooleanType>; |
|
using Int32Encoder = TypedEncoder<Int32Type>; |
|
using Int64Encoder = TypedEncoder<Int64Type>; |
|
using Int96Encoder = TypedEncoder<Int96Type>; |
|
using FloatEncoder = TypedEncoder<FloatType>; |
|
using DoubleEncoder = TypedEncoder<DoubleType>; |
|
using ByteArrayEncoder = TypedEncoder<ByteArrayType>; |
|
using FLBAEncoder = TypedEncoder<FLBAType>; |
|
|
|
template <typename DType> |
|
class TypedDecoder; |
|
|
|
class BooleanDecoder; |
|
using Int32Decoder = TypedDecoder<Int32Type>; |
|
using Int64Decoder = TypedDecoder<Int64Type>; |
|
using Int96Decoder = TypedDecoder<Int96Type>; |
|
using FloatDecoder = TypedDecoder<FloatType>; |
|
using DoubleDecoder = TypedDecoder<DoubleType>; |
|
using ByteArrayDecoder = TypedDecoder<ByteArrayType>; |
|
class FLBADecoder; |
|
|
|
// Primary template is declared but never defined: only the explicit
// specializations that follow can be used.
template <class T>
struct EncodingTraits;
|
|
|
template <> |
|
struct EncodingTraits<BooleanType> { |
|
using Encoder = BooleanEncoder; |
|
using Decoder = BooleanDecoder; |
|
|
|
using ArrowType = ::arrow::BooleanType; |
|
using Accumulator = ::arrow::BooleanBuilder; |
|
struct DictAccumulator {}; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<Int32Type> { |
|
using Encoder = Int32Encoder; |
|
using Decoder = Int32Decoder; |
|
|
|
using ArrowType = ::arrow::Int32Type; |
|
using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<Int64Type> { |
|
using Encoder = Int64Encoder; |
|
using Decoder = Int64Decoder; |
|
|
|
using ArrowType = ::arrow::Int64Type; |
|
using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<Int96Type> { |
|
using Encoder = Int96Encoder; |
|
using Decoder = Int96Decoder; |
|
|
|
struct Accumulator {}; |
|
struct DictAccumulator {}; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<FloatType> { |
|
using Encoder = FloatEncoder; |
|
using Decoder = FloatDecoder; |
|
|
|
using ArrowType = ::arrow::FloatType; |
|
using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<DoubleType> { |
|
using Encoder = DoubleEncoder; |
|
using Decoder = DoubleDecoder; |
|
|
|
using ArrowType = ::arrow::DoubleType; |
|
using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<ByteArrayType> { |
|
using Encoder = ByteArrayEncoder; |
|
using Decoder = ByteArrayDecoder; |
|
|
|
using ArrowType = ::arrow::BinaryType; |
|
|
|
|
|
struct Accumulator { |
|
std::unique_ptr<::arrow::BinaryBuilder> builder; |
|
std::vector<std::shared_ptr<::arrow::Array>> chunks; |
|
}; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>; |
|
}; |
|
|
|
template <> |
|
struct EncodingTraits<FLBAType> { |
|
using Encoder = FLBAEncoder; |
|
using Decoder = FLBADecoder; |
|
|
|
using ArrowType = ::arrow::FixedSizeBinaryType; |
|
using Accumulator = ::arrow::FixedSizeBinaryBuilder; |
|
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>; |
|
}; |
|
|
|
// Forward declaration: this header only ever uses ColumnDescriptor through
// `const ColumnDescriptor*` factory parameters.
class ColumnDescriptor;
|
|
|
|
|
class Encoder { |
|
public: |
|
virtual ~Encoder() = default; |
|
|
|
virtual int64_t EstimatedDataEncodedSize() = 0; |
|
virtual std::shared_ptr<Buffer> FlushValues() = 0; |
|
virtual Encoding::type encoding() const = 0; |
|
|
|
virtual void Put(const ::arrow::Array& values) = 0; |
|
|
|
|
|
|
|
|
|
virtual int64_t ReportUnencodedDataBytes() = 0; |
|
|
|
virtual MemoryPool* memory_pool() const = 0; |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
template <typename DType> |
|
class TypedEncoder : virtual public Encoder { |
|
public: |
|
using T = typename DType::c_type; |
|
|
|
using Encoder::Put; |
|
|
|
virtual void Put(const T* src, int num_values) = 0; |
|
|
|
virtual void Put(const std::vector<T>& src, int num_values = -1); |
|
|
|
virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, |
|
int64_t valid_bits_offset) = 0; |
|
}; |
|
|
|
template <typename DType> |
|
void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) { |
|
if (num_values == -1) { |
|
num_values = static_cast<int>(src.size()); |
|
} |
|
Put(src.data(), num_values); |
|
} |
|
|
|
template <> |
|
inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) { |
|
|
|
|
|
} |
|
|
|
|
|
template <typename DType> |
|
class DictEncoder : virtual public TypedEncoder<DType> { |
|
public: |
|
|
|
|
|
|
|
|
|
|
|
virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0; |
|
|
|
virtual int dict_encoded_size() const = 0; |
|
|
|
virtual int bit_width() const = 0; |
|
|
|
|
|
|
|
virtual void WriteDict(uint8_t* buffer) const = 0; |
|
|
|
virtual int num_entries() const = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual void PutIndices(const ::arrow::Array& indices) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual void PutDictionary(const ::arrow::Array& values) = 0; |
|
}; |
|
|
|
|
|
|
|
|
|
class Decoder { |
|
public: |
|
virtual ~Decoder() = default; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual void SetData(int num_values, const uint8_t* data, int len) = 0; |
|
|
|
|
|
|
|
virtual int values_left() const = 0; |
|
virtual Encoding::type encoding() const = 0; |
|
}; |
|
|
|
template <typename DType> |
|
class TypedDecoder : virtual public Decoder { |
|
public: |
|
using T = typename DType::c_type; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual int Decode(T* buffer, int max_values) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeSpaced(T* buffer, int num_values, int null_count, |
|
const uint8_t* valid_bits, int64_t valid_bits_offset) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, |
|
int64_t valid_bits_offset, |
|
typename EncodingTraits<DType>::Accumulator* out) = 0; |
|
|
|
|
|
|
|
|
|
int DecodeArrowNonNull(int num_values, |
|
typename EncodingTraits<DType>::Accumulator* out) { |
|
return DecodeArrow(num_values, 0, NULLPTR, 0, out); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, |
|
int64_t valid_bits_offset, |
|
typename EncodingTraits<DType>::DictAccumulator* builder) = 0; |
|
|
|
|
|
|
|
|
|
int DecodeArrowNonNull(int num_values, |
|
typename EncodingTraits<DType>::DictAccumulator* builder) { |
|
return DecodeArrow(num_values, 0, NULLPTR, 0, builder); |
|
} |
|
}; |
|
|
|
template <typename DType> |
|
class DictDecoder : virtual public TypedDecoder<DType> { |
|
public: |
|
using T = typename DType::c_type; |
|
|
|
virtual void SetDict(TypedDecoder<DType>* dictionary) = 0; |
|
|
|
|
|
|
|
virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeIndicesSpaced(int num_values, int null_count, |
|
const uint8_t* valid_bits, int64_t valid_bits_offset, |
|
::arrow::ArrayBuilder* builder) = 0; |
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0; |
|
|
|
|
|
|
|
|
|
|
|
virtual int DecodeIndices(int num_values, int32_t* indices) = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0; |
|
}; |
|
|
|
|
|
|
|
|
|
// Boolean decoder interface: TypedDecoder<BooleanType> plus one extra Decode
// overload that writes into a uint8_t buffer.
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
 public:
  // Keep the inherited Decode overload visible; the overload declared below
  // would otherwise hide it.
  using TypedDecoder<BooleanType>::Decode;

  // Decodes up to `max_values` values into a byte buffer. Returns an int.
  // NOTE(review): presumably the output is bit-packed rather than one byte
  // per value -- confirm against an implementation.
  virtual int Decode(uint8_t* buffer, int max_values) = 0;
};
|
|
|
// Fixed-length byte array decoder interface. As visible here it adds no new
// members; it only re-exposes the inherited DecodeSpaced by name.
class FLBADecoder : virtual public TypedDecoder<FLBAType> {
 public:
  using TypedDecoder<FLBAType>::DecodeSpaced;
};
|
|
|
// Untyped encoder factory, declared here and defined elsewhere.
// Takes the physical type, the encoding, a dictionary flag (default false), an
// optional column descriptor (default null), and a memory pool (default: the
// Arrow default pool). Returns an owning pointer to the base Encoder.
PARQUET_EXPORT
std::unique_ptr<Encoder> MakeEncoder(
    Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
    const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
|
|
template <typename DType> |
|
std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder( |
|
Encoding::type encoding, bool use_dictionary = false, |
|
const ColumnDescriptor* descr = NULLPTR, |
|
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { |
|
using OutType = typename EncodingTraits<DType>::Encoder; |
|
std::unique_ptr<Encoder> base = |
|
MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool); |
|
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release())); |
|
} |
|
|
|
// Untyped decoder factory, declared here and defined elsewhere.
// Takes the physical type, the encoding, an optional column descriptor
// (default null), and a memory pool (default: the Arrow default pool).
// Returns an owning pointer to the base Decoder.
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDecoder(
    Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
|
|
namespace detail { |
|
|
|
PARQUET_EXPORT |
|
std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num, |
|
const ColumnDescriptor* descr, |
|
::arrow::MemoryPool* pool); |
|
|
|
} |
|
|
|
template <typename DType> |
|
std::unique_ptr<DictDecoder<DType>> MakeDictDecoder( |
|
const ColumnDescriptor* descr = NULLPTR, |
|
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { |
|
using OutType = DictDecoder<DType>; |
|
auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); |
|
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release())); |
|
} |
|
|
|
template <typename DType> |
|
std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder( |
|
Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR, |
|
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { |
|
using OutType = typename EncodingTraits<DType>::Decoder; |
|
std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool); |
|
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release())); |
|
} |
|
|
|
} |
|
|